summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.h2
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S9
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S20
-rw-r--r--arch/x86/kernel/alternative.c301
-rw-r--r--arch/x86/kernel/amd_gart_64.c19
-rw-r--r--arch/x86/kernel/amd_nb.c6
-rw-r--r--arch/x86/kernel/apb_timer.c2
-rw-r--r--arch/x86/kernel/apic/apic.c381
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c66
-rw-r--r--arch/x86/kernel/apic/apic_noop.c18
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c8
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c33
-rw-r--r--arch/x86/kernel/apic/io_apic.c33
-rw-r--r--arch/x86/kernel/apic/ipi.c174
-rw-r--r--arch/x86/kernel/apic/local.h68
-rw-r--r--arch/x86/kernel/apic/msi.c128
-rw-r--r--arch/x86/kernel/apic/probe_32.c41
-rw-r--r--arch/x86/kernel/apic/probe_64.c21
-rw-r--r--arch/x86/kernel/apic/vector.c11
-rw-r--r--arch/x86/kernel/apic/x2apic.h9
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c23
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c23
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c183
-rw-r--r--arch/x86/kernel/asm-offsets_64.c21
-rw-r--r--arch/x86/kernel/cpu/Makefile8
-rw-r--r--arch/x86/kernel/cpu/amd.c106
-rw-r--r--arch/x86/kernel/cpu/bugs.c212
-rw-r--r--arch/x86/kernel/cpu/centaur.c37
-rw-r--r--arch/x86/kernel/cpu/common.c325
-rw-r--r--arch/x86/kernel/cpu/cpu.h22
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c97
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c145
-rw-r--r--arch/x86/kernel/cpu/hygon.c21
-rw-r--r--arch/x86/kernel/cpu/intel.c129
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c8
-rw-r--r--arch/x86/kernel/cpu/mce/core.c165
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c28
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h8
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c4
-rw-r--r--arch/x86/kernel/cpu/mce/therm_throt.c267
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c36
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh15
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c21
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c66
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c15
-rw-r--r--arch/x86/kernel/cpu/rdrand.c22
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c159
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c2
-rw-r--r--arch/x86/kernel/cpu/tsx.c144
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c96
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c37
-rw-r--r--arch/x86/kernel/crash.c132
-rw-r--r--arch/x86/kernel/crash_core_32.c17
-rw-r--r--arch/x86/kernel/crash_core_64.c24
-rw-r--r--arch/x86/kernel/crash_dump_64.c5
-rw-r--r--arch/x86/kernel/doublefault.c83
-rw-r--r--arch/x86/kernel/doublefault_32.c136
-rw-r--r--arch/x86/kernel/dumpstack.c33
-rw-r--r--arch/x86/kernel/dumpstack_32.c30
-rw-r--r--arch/x86/kernel/dumpstack_64.c7
-rw-r--r--arch/x86/kernel/e820.c23
-rw-r--r--arch/x86/kernel/early-quirks.c7
-rw-r--r--arch/x86/kernel/fpu/signal.c3
-rw-r--r--arch/x86/kernel/fpu/xstate.c40
-rw-r--r--arch/x86/kernel/ftrace.c689
-rw-r--r--arch/x86/kernel/ftrace_32.S23
-rw-r--r--arch/x86/kernel/ftrace_64.S89
-rw-r--r--arch/x86/kernel/head64.c22
-rw-r--r--arch/x86/kernel/head_32.S72
-rw-r--r--arch/x86/kernel/head_64.S113
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/ima_arch.c4
-rw-r--r--arch/x86/kernel/ioport.c216
-rw-r--r--arch/x86/kernel/irq.c10
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/irq_64.c9
-rw-r--r--arch/x86/kernel/irqflags.S8
-rw-r--r--arch/x86/kernel/jailhouse.c136
-rw-r--r--arch/x86/kernel/jump_label.c119
-rw-r--r--arch/x86/kernel/kdebugfs.c21
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c3
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c27
-rw-r--r--arch/x86/kernel/kprobes/opt.c64
-rw-r--r--arch/x86/kernel/ksysfs.c31
-rw-r--r--arch/x86/kernel/kvm.c59
-rw-r--r--arch/x86/kernel/ldt.c83
-rw-r--r--arch/x86/kernel/machine_kexec_32.c16
-rw-r--r--arch/x86/kernel/machine_kexec_64.c66
-rw-r--r--arch/x86/kernel/msr.c8
-rw-r--r--arch/x86/kernel/nmi.c23
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1584
-rw-r--r--arch/x86/kernel/pci-dma.c28
-rw-r--r--arch/x86/kernel/pci-swiotlb.c1
-rw-r--r--arch/x86/kernel/process.c211
-rw-r--r--arch/x86/kernel/process.h2
-rw-r--r--arch/x86/kernel/process_32.c78
-rw-r--r--arch/x86/kernel/process_64.c87
-rw-r--r--arch/x86/kernel/ptrace.c48
-rw-r--r--arch/x86/kernel/quirks.c6
-rw-r--r--arch/x86/kernel/reboot.c7
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S13
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S13
-rw-r--r--arch/x86/kernel/setup.c213
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smp.c88
-rw-r--r--arch/x86/kernel/smpboot.c12
-rw-r--r--arch/x86/kernel/sys_x86_64.c9
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c2
-rw-r--r--arch/x86/kernel/tboot.c17
-rw-r--r--arch/x86/kernel/tce_64.c177
-rw-r--r--arch/x86/kernel/time.c12
-rw-r--r--arch/x86/kernel/traps.c213
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/tsc_msr.c5
-rw-r--r--arch/x86/kernel/tsc_sync.c9
-rw-r--r--arch/x86/kernel/umip.c81
-rw-r--r--arch/x86/kernel/unwind_orc.c11
-rw-r--r--arch/x86/kernel/uprobes.c19
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S20
-rw-r--r--arch/x86/kernel/x86_init.c28
139 files changed, 4406 insertions, 4925 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3578ad248bc9..9b294c13809a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,13 +94,16 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+ifeq ($(CONFIG_X86_32),y)
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+endif
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
@@ -134,7 +137,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
+obj-$(CONFIG_X86_UMIP) += umip.o
obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
@@ -146,7 +149,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 17b33ef604f3..04205ce127a1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1760,6 +1760,11 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
e820__update_table_print();
}
+void x86_default_set_root_pointer(u64 addr)
+{
+ boot_params.acpi_rsdp_addr = addr;
+}
+
u64 x86_default_get_root_pointer(void)
{
return boot_params.acpi_rsdp_addr;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca13851f0570..26b7256f590f 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -27,6 +27,17 @@ static char temp_stack[4096];
#endif
/**
+ * acpi_get_wakeup_address - provide physical address for S3 wakeup
+ *
+ * Returns the physical address where the kernel should be resumed after the
+ * system awakes from S3, e.g. for programming into the firmware waking vector.
+ */
+unsigned long acpi_get_wakeup_address(void)
+{
+ return ((unsigned long)(real_mode_header->wakeup_start));
+}
+
+/**
* x86_acpi_enter_sleep_state - enter sleep state
* @state: Sleep state to enter.
*
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index fbb60ca4255c..d06c2079b6c1 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -3,7 +3,7 @@
* Variables and functions used by the code in sleep.c
*/
-#include <asm/realmode.h>
+#include <linux/linkage.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index e95e95960156..daf88f8143c5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -9,8 +9,7 @@
.code32
ALIGN
-ENTRY(wakeup_pmode_return)
-wakeup_pmode_return:
+SYM_CODE_START(wakeup_pmode_return)
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %fs
@@ -39,6 +38,7 @@ wakeup_pmode_return:
# jump to place where we left off
movl saved_eip, %eax
jmp *%eax
+SYM_CODE_END(wakeup_pmode_return)
bogus_magic:
jmp bogus_magic
@@ -72,7 +72,7 @@ restore_registers:
popfl
ret
-ENTRY(do_suspend_lowlevel)
+SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
@@ -87,10 +87,11 @@ ret_point:
call restore_registers
call restore_processor_state
ret
+SYM_CODE_END(do_suspend_lowlevel)
.data
ALIGN
-ENTRY(saved_magic) .long 0
+SYM_DATA(saved_magic, .long 0)
saved_eip: .long 0
# saved registers
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index b0715c3ac18d..c8daa92f38dc 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -14,12 +14,17 @@
/*
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
-ENTRY(wakeup_long64)
+SYM_FUNC_START(wakeup_long64)
movq saved_magic, %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
- jne bogus_64_magic
+ je 2f
+ /* stop here on a saved_magic mismatch */
+ movq $0xbad6d61676963, %rcx
+1:
+ jmp 1b
+2:
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
@@ -35,12 +40,9 @@ ENTRY(wakeup_long64)
movq saved_rip, %rax
jmp *%rax
-ENDPROC(wakeup_long64)
+SYM_FUNC_END(wakeup_long64)
-bogus_64_magic:
- jmp bogus_64_magic
-
-ENTRY(do_suspend_lowlevel)
+SYM_FUNC_START(do_suspend_lowlevel)
FRAME_BEGIN
subq $8, %rsp
xorl %eax, %eax
@@ -123,7 +125,7 @@ ENTRY(do_suspend_lowlevel)
addq $8, %rsp
FRAME_END
jmp restore_processor_state
-ENDPROC(do_suspend_lowlevel)
+SYM_FUNC_END(do_suspend_lowlevel)
.data
saved_rbp: .quad 0
@@ -134,4 +136,4 @@ saved_rbx: .quad 0
saved_rip: .quad 0
saved_rsp: .quad 0
-ENTRY(saved_magic) .quad 0
+SYM_DATA(saved_magic, .quad 0)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ccd32013c47a..15ac0d5f4b40 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -23,6 +23,7 @@
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
@@ -713,7 +714,7 @@ void __init alternative_instructions(void)
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
- * Ignoring it is worse than a unlikely patching race.
+ * Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
* expect a machine check to cause undue problems during to code
@@ -753,8 +754,8 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to be protected against NMI or
+ * MCE handlers seeing an inconsistent instruction while you patch.
*/
void __init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
@@ -936,74 +937,139 @@ static void do_sync_core(void *info)
sync_core();
}
-static struct bp_patching_desc {
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+struct text_poke_loc {
+ s32 rel_addr; /* addr := _stext + rel_addr */
+ s32 rel32;
+ u8 opcode;
+ const u8 text[POKE_MAX_OPCODE_SIZE];
+};
+
+struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
-} bp_patching;
+ atomic_t refs;
+};
+
+static struct bp_patching_desc *bp_desc;
-static int patch_cmp(const void *key, const void *elt)
+static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+{
+ struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
+
+ if (!desc || !atomic_inc_not_zero(&desc->refs))
+ return NULL;
+
+ return desc;
+}
+
+static inline void put_desc(struct bp_patching_desc *desc)
+{
+ smp_mb__before_atomic();
+ atomic_dec(&desc->refs);
+}
+
+static inline void *text_poke_addr(struct text_poke_loc *tp)
+{
+ return _stext + tp->rel_addr;
+}
+
+static int notrace patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
- if (key < tp->addr)
+ if (key < text_poke_addr(tp))
return -1;
- if (key > tp->addr)
+ if (key > text_poke_addr(tp))
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
+ struct bp_patching_desc *desc;
struct text_poke_loc *tp;
- unsigned char int3 = 0xcc;
+ int len, ret = 0;
void *ip;
+ if (user_mode(regs))
+ return 0;
+
/*
* Having observed our INT3 instruction, we now must observe
- * bp_patching.nr_entries.
- *
- * nr_entries != 0 INT3
- * WMB RMB
- * write INT3 if (nr_entries)
+ * bp_desc:
*
- * Idem for other elements in bp_patching.
+ * bp_desc = desc INT3
+ * WMB RMB
+ * write INT3 if (desc)
*/
smp_rmb();
- if (likely(!bp_patching.nr_entries))
- return 0;
-
- if (user_mode(regs))
+ desc = try_get_desc(&bp_desc);
+ if (!desc)
return 0;
/*
- * Discount the sizeof(int3). See text_poke_bp_batch().
+ * Discount the INT3. See text_poke_bp_batch().
*/
- ip = (void *) regs->ip - sizeof(int3);
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(bp_patching.nr_entries > 1)) {
- tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+ if (unlikely(desc->nr_entries > 1)) {
+ tp = bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
- return 0;
+ goto out_put;
} else {
- tp = bp_patching.vec;
- if (tp->addr != ip)
- return 0;
+ tp = desc->vec;
+ if (text_poke_addr(tp) != ip)
+ goto out_put;
}
- /* set up the specified breakpoint detour */
- regs->ip = (unsigned long) tp->detour;
+ len = text_opcode_size(tp->opcode);
+ ip += len;
- return 1;
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ goto out_put;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tp->rel32);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ break;
+
+ default:
+ BUG();
+ }
+
+ ret = 1;
+
+out_put:
+ put_desc(desc);
+ return ret;
}
NOKPROBE_SYMBOL(poke_int3_handler);
+#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
+static struct text_poke_loc tp_vec[TP_VEC_MAX];
+static int tp_vec_nr;
+
/**
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
@@ -1014,7 +1080,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* synchronization using int3 breakpoint.
*
* The way it is done:
- * - For each entry in the vector:
+ * - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - For each entry in the vector:
@@ -1025,16 +1091,20 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode
* - sync cores
*/
-void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
- int patched_all_but_first = 0;
- unsigned char int3 = 0xcc;
+ struct bp_patching_desc desc = {
+ .vec = tp,
+ .nr_entries = nr_entries,
+ .refs = ATOMIC_INIT(1),
+ };
+ unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
+ int do_sync;
lockdep_assert_held(&text_mutex);
- bp_patching.vec = tp;
- bp_patching.nr_entries = nr_entries;
+ smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
/*
* Corresponding read barrier in int3 notifier for making sure the
@@ -1046,45 +1116,153 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, &int3, sizeof(int3));
+ text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
/*
* Second step: update all but the first byte of the patched range.
*/
- for (i = 0; i < nr_entries; i++) {
- if (tp[i].len - sizeof(int3) > 0) {
- text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].opcode + sizeof(int3),
- tp[i].len - sizeof(int3));
- patched_all_but_first++;
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ int len = text_opcode_size(tp[i].opcode);
+
+ if (len - INT3_INSN_SIZE > 0) {
+ text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ (const char *)tp[i].text + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+ do_sync++;
}
}
- if (patched_all_but_first) {
+ if (do_sync) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
}
/*
* Third step: replace the first byte (int3) by the first byte of
* replacing opcode.
*/
- for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ if (tp[i].text[0] == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
+ do_sync++;
+ }
+
+ if (do_sync)
+ text_poke_sync();
- on_each_cpu(do_sync_core, NULL, 1);
/*
- * sync_core() implies an smp_mb() and orders this store against
- * the writing of the new instruction.
+ * Remove and synchronize_rcu(), except we have a very primitive
+ * refcount based completion.
*/
- bp_patching.vec = NULL;
- bp_patching.nr_entries = 0;
+ WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
+ if (!atomic_dec_and_test(&desc.refs))
+ atomic_cond_read_acquire(&desc.refs, !VAL);
+}
+
+void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+ const void *opcode, size_t len, const void *emulate)
+{
+ struct insn insn;
+
+ memcpy((void *)tp->text, opcode, len);
+ if (!emulate)
+ emulate = opcode;
+
+ kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ BUG_ON(!insn_complete(&insn));
+ BUG_ON(len != insn.length);
+
+ tp->rel_addr = addr - (void *)_stext;
+ tp->opcode = insn.opcode.bytes[0];
+
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ tp->rel32 = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[len], len));
+ tp->opcode = JMP8_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+ tp->opcode = JMP32_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
+}
+
+/*
+ * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * early if needed.
+ */
+static bool tp_order_fail(void *addr)
+{
+ struct text_poke_loc *tp;
+
+ if (!tp_vec_nr)
+ return false;
+
+ if (!addr) /* force */
+ return true;
+
+ tp = &tp_vec[tp_vec_nr - 1];
+ if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
+ return true;
+
+ return false;
+}
+
+static void text_poke_flush(void *addr)
+{
+ if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
+ text_poke_bp_batch(tp_vec, tp_vec_nr);
+ tp_vec_nr = 0;
+ }
+}
+
+void text_poke_finish(void)
+{
+ text_poke_flush(NULL);
+}
+
+void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct text_poke_loc *tp;
+
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
+ text_poke_flush(addr);
+
+ tp = &tp_vec[tp_vec_nr++];
+ text_poke_loc_init(tp, addr, opcode, len, emulate);
}
/**
@@ -1098,20 +1276,15 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp = {
- .detour = handler,
- .addr = addr,
- .len = len,
- };
+ struct text_poke_loc tp;
- if (len > POKE_MAX_OPCODE_SIZE) {
- WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
return;
}
- memcpy((void *)tp.opcode, opcode, len);
-
+ text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index a585ea6f686a..4e5f50236048 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return force_iommu || !dma_capable(dev, addr, size);
+ return force_iommu || !dma_capable(dev, addr, size, true);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return !dma_capable(dev, addr, size);
+ return !dma_capable(dev, addr, size, true);
}
/* Map a single continuous physical area into the IOMMU.
@@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- pr_warning(
- "PCI-DMA: Warning: Small IOMMU %luMB."
+ pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
- iommu_size >> 20);
+ iommu_size >> 20);
}
return iommu_size;
@@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
nommu:
/* Should not happen anymore */
- pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- "falling back to iommu=soft.\n");
+ pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n");
return -1;
}
@@ -677,7 +675,10 @@ static const struct dma_map_ops gart_dma_ops = {
.unmap_page = gart_unmap_page,
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
.dma_supported = dma_direct_supported,
+ .get_required_mask = dma_direct_get_required_mask,
};
static void gart_iommu_shutdown(void)
@@ -730,8 +731,8 @@ int __init gart_iommu_init(void)
!gart_iommu_aperture ||
(no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
- pr_warning("falling back to iommu=soft.\n");
+ pr_warn("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warn("falling back to iommu=soft.\n");
}
return 0;
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index d63e63b7d1d9..69aed0ebbdfc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -21,6 +21,8 @@
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -50,6 +52,8 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{}
};
EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
@@ -63,6 +67,8 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 5da106f84e84..fe698f96617c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -95,7 +95,7 @@ static inline void apbt_set_mapping(void)
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
- apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE);
if (!apbt_virt_address) {
pr_debug("Failed mapping APBT phy address at %lu\n",\
(unsigned long)apbt_address);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f5291362da1a..5f973fed3c9f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -65,10 +65,10 @@ unsigned int num_processors;
unsigned disabled_cpus;
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int boot_cpu_physical_apicid __ro_after_init = -1U;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
-u8 boot_cpu_apic_version;
+u8 boot_cpu_apic_version __ro_after_init;
/*
* The highest APIC ID seen during enumeration.
@@ -85,13 +85,13 @@ physid_mask_t phys_cpu_present_map;
* disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
* avoid undefined behaviour caused by sending INIT from AP to BSP.
*/
-static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
/*
* This variable controls which CPUs receive external NMIs. By default,
* external NMIs are delivered only to the BSP.
*/
-static int apic_extnmi = APIC_EXTNMI_BSP;
+static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
* Map cpu index to physical APIC ID
@@ -114,7 +114,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
+static int enabled_via_apicbase __ro_after_init;
/*
* Handle interrupt mode configuration register (IMCR).
@@ -172,23 +172,23 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-unsigned long mp_lapic_addr;
-int disable_apic;
+unsigned long mp_lapic_addr __ro_after_init;
+int disable_apic __ro_after_init;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __initdata;
/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
+int local_apic_timer_c2_ok __ro_after_init;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
/*
* Debug level, exported for io_apic.c
*/
-int apic_verbosity;
+int apic_verbosity __ro_after_init;
-int pic_mode;
+int pic_mode __ro_after_init;
/* Have we found an MP table */
-int smp_found_config;
+int smp_found_config __ro_after_init;
static struct resource lapic_resource = {
.name = "Local APIC",
@@ -199,7 +199,7 @@ unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
-static unsigned long apic_phys;
+static unsigned long apic_phys __ro_after_init;
/*
* Get the LAPIC version
@@ -590,21 +590,21 @@ static u32 skx_deadline_rev(void)
static const struct x86_cpu_id deadline_match[] = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
- DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52),
{},
};
@@ -722,7 +722,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
- * Temporary interrupt handler.
+ * Temporary interrupt handler and polled calibration function.
*/
static void __init lapic_cal_handler(struct clock_event_device *dev)
{
@@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+ pr_warn("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n", (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -830,8 +830,21 @@ bool __init apic_needs_pit(void)
if (!tsc_khz || !cpu_khz)
return true;
- /* Is there an APIC at all? */
- if (!boot_cpu_has(X86_FEATURE_APIC))
+ /* Is there an APIC at all or is it disabled? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic)
+ return true;
+
+ /*
+ * If interrupt delivery mode is legacy PIC or virtual wire without
+ * configuration, the local APIC timer wont be set up. Make sure
+ * that the PIT is initialized.
+ */
+ if (apic_intr_mode == APIC_PIC ||
+ apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
+ return true;
+
+ /* Virt guests may lack ARAT, but still have DEADLINE */
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
return true;
/* Deadline timer is based on TSC so no further PIT action required */
@@ -851,7 +864,8 @@ bool __init apic_needs_pit(void)
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
- void (*real_handler)(struct clock_event_device *dev);
+ u64 tsc_perj = 0, tsc_start = 0;
+ unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
int pm_referenced = 0;
@@ -878,28 +892,64 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
"calibrating APIC timer ...\n");
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
local_irq_disable();
- /* Replace the global interrupt handler */
- real_handler = global_clock_event->event_handler;
- global_clock_event->event_handler = lapic_cal_handler;
-
/*
* Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
__setup_APIC_LVTT(0xffffffff, 0, 0);
- /* Let the interrupts run */
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
local_irq_enable();
- while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
- cpu_relax();
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
- local_irq_disable();
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
- /* Restore the real event handler */
- global_clock_event->event_handler = real_handler;
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
+
+ local_irq_disable();
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
@@ -936,17 +986,18 @@ static int __init calibrate_APIC_clock(void)
*/
if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
- pr_warning("APIC frequency too slow, disabling apic timer\n");
+ pr_warn("APIC frequency too slow, disabling apic timer\n");
return -1;
}
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/*
- * PM timer calibration failed or not turned on
- * so lets try APIC timer based calibration
+ * PM timer calibration failed or not turned on so lets try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
*/
- if (!pm_referenced) {
+ if (!pm_referenced && global_clock_event) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
/*
@@ -979,7 +1030,7 @@ static int __init calibrate_APIC_clock(void)
local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- pr_warning("APIC timer disabled due to verification failure\n");
+ pr_warn("APIC timer disabled due to verification failure\n");
return -1;
}
@@ -1053,8 +1104,8 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
- smp_processor_id());
+ pr_warn("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
@@ -1182,25 +1233,38 @@ void clear_local_APIC(void)
}
/**
- * disable_local_APIC - clear and disable the local APIC
+ * apic_soft_disable - Clears and software disables the local APIC on hotplug
+ *
+ * Contrary to disable_local_APIC() this does not touch the enable bit in
+ * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC
+ * bus would require a hardware reset as the APIC would lose track of bus
+ * arbitration. On systems with FSB delivery APICBASE could be disabled,
+ * but it has to be guaranteed that no interrupt is sent to the APIC while
+ * in that state and it's not clear from the SDM whether it still responds
+ * to INIT/SIPI messages. Stay on the safe side and use software disable.
*/
-void disable_local_APIC(void)
+void apic_soft_disable(void)
{
- unsigned int value;
-
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
- return;
+ u32 value;
clear_local_APIC();
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
+ /* Soft disable APIC (implies clearing of registers for 82489DX!). */
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ apic_soft_disable();
#ifdef CONFIG_X86_32
/*
@@ -1265,9 +1329,9 @@ void __init sync_Arb_IDs(void)
APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
-enum apic_intr_mode_id apic_intr_mode;
+enum apic_intr_mode_id apic_intr_mode __ro_after_init;
-static int __init apic_intr_mode_select(void)
+static int __init __apic_intr_mode_select(void)
{
/* Check kernel option */
if (disable_apic) {
@@ -1329,6 +1393,12 @@ static int __init apic_intr_mode_select(void)
return APIC_SYMMETRIC_IO;
}
+/* Select the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_select(void)
+{
+ apic_intr_mode = __apic_intr_mode_select();
+}
+
/*
* An initial setup of the virtual wire mode.
*/
@@ -1385,8 +1455,6 @@ void __init apic_intr_mode_init(void)
{
bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
- apic_intr_mode = apic_intr_mode_select();
-
switch (apic_intr_mode) {
case APIC_PIC:
pr_info("APIC: Keep in PIC mode(8259)\n");
@@ -1453,54 +1521,72 @@ static void lapic_setup_esr(void)
oldvalue, value);
}
-static void apic_pending_intr_clear(void)
+#define APIC_IR_REGS APIC_ISR_NR
+#define APIC_IR_BITS (APIC_IR_REGS * 32)
+#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG)
+
+union apic_ir {
+ unsigned long map[APIC_IR_MAPSIZE];
+ u32 regs[APIC_IR_REGS];
+};
+
+static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
{
- long long max_loops = cpu_khz ? cpu_khz : 1000000;
- unsigned long long tsc = 0, ntsc;
- unsigned int queued;
- unsigned long value;
- int i, j, acked = 0;
+ int i, bit;
+
+ /* Read the IRRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ irr->regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ /* Read the ISRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
- if (boot_cpu_has(X86_FEATURE_TSC))
- tsc = rdtsc();
/*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+ * If the ISR map is not empty. ACK the APIC and run another round
+ * to verify whether a pending IRR has been unblocked and turned
+ * into a ISR.
*/
- do {
- queued = 0;
- for (i = APIC_ISR_NR - 1; i >= 0; i--)
- queued |= apic_read(APIC_IRR + i*0x10);
-
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for_each_set_bit(j, &value, 32) {
- ack_APIC_irq();
- acked++;
- }
- }
- if (acked > 256) {
- pr_err("LAPIC pending interrupts after %d EOI\n", acked);
- break;
- }
- if (queued) {
- if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
- ntsc = rdtsc();
- max_loops = (long long)cpu_khz << 10;
- max_loops -= ntsc - tsc;
- } else {
- max_loops--;
- }
- }
- } while (queued && max_loops > 0);
- WARN_ON(max_loops <= 0);
+ if (!bitmap_empty(isr->map, APIC_IR_BITS)) {
+ /*
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an ACK
+ * per set bit.
+ */
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ ack_APIC_irq();
+ return true;
+ }
+
+ return !bitmap_empty(irr->map, APIC_IR_BITS);
+}
+
+/*
+ * After a crash, we no longer service the interrupts and a pending
+ * interrupt from previous kernel might still have ISR bit set.
+ *
+ * Most probably by now the CPU has serviced that pending interrupt and it
+ * might not have done the ack_APIC_irq() because it thought, interrupt
+ * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
+ * the ISR bit and cpu thinks it has already serivced the interrupt. Hence
+ * a vector might get locked. It was noticed for timer irq (vector
+ * 0x31). Issue an extra EOI to clear ISR.
+ *
+ * If there are pending IRR bits they turn into ISR bits after a higher
+ * priority ISR bit has been acked.
+ */
+static void apic_pending_intr_clear(void)
+{
+ union apic_ir irr, isr;
+ unsigned int i;
+
+ /* 512 loops are way oversized and give the APIC a chance to obey. */
+ for (i = 0; i < 512; i++) {
+ if (!apic_check_and_ack(&irr, &isr))
+ return;
+ }
+ /* Dump the IRR/ISR content if that failed */
+ pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map);
}
/**
@@ -1513,16 +1599,20 @@ static void setup_local_APIC(void)
{
int cpu = smp_processor_id();
unsigned int value;
-#ifdef CONFIG_X86_32
- int logical_apicid, ldr_apicid;
-#endif
-
if (disable_apic) {
disable_ioapic_support();
return;
}
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
if (lapic_is_integrated() && apic->disable_esr) {
@@ -1532,8 +1622,6 @@ static void setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- perf_events_lapic_init();
-
/*
* Double-check whether this APIC is really registered.
* This is meaningless in clustered apic mode, so we skip it.
@@ -1548,26 +1636,35 @@ static void setup_local_APIC(void)
apic->init_apic_ldr();
#ifdef CONFIG_X86_32
- /*
- * APIC LDR is initialized. If logical_apicid mapping was
- * initialized during get_smp_config(), make sure it matches the
- * actual value.
- */
- logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
- WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid);
- /* always use the value from LDR */
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ if (apic->dest_logical) {
+ int logical_apicid, ldr_apicid;
+
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches
+ * the actual value.
+ */
+ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ if (logical_apicid != BAD_APICID)
+ WARN_ON(logical_apicid != ldr_apicid);
+ /* Always use the value from LDR. */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ }
#endif
/*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
+ * Set Task Priority to 'accept all except vectors 0-31'. An APIC
+ * vector in the 16-31 range could be delivered if TPR == 0, but we
+ * would think it's an exception and terrible things will happen. We
+ * never change this later on.
*/
value = apic_read(APIC_TASKPRI);
value &= ~APIC_TPRI_MASK;
+ value |= 0x10;
apic_write(APIC_TASKPRI, value);
+ /* Clear eventually stale ISR/IRR bits */
apic_pending_intr_clear();
/*
@@ -1614,6 +1711,8 @@ static void setup_local_APIC(void)
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
+ perf_events_lapic_init();
+
/*
* Set up LVT0, LVT1:
*
@@ -1725,11 +1824,11 @@ static int __init setup_nox2apic(char *str)
int apicid = native_apic_msr_read(APIC_ID);
if (apicid >= 255) {
- pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
- apicid);
+ pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
return 0;
}
- pr_warning("x2apic already enabled.\n");
+ pr_warn("x2apic already enabled.\n");
__x2apic_disable();
}
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
@@ -1897,7 +1996,7 @@ static int __init apic_verify(void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
+ pr_warn("Could not enable APIC!\n");
return -1;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -2251,7 +2350,7 @@ static int cpuid_to_apicid[] = {
#ifdef CONFIG_SMP
/**
* apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
- * @id: APIC ID to check
+ * @apicid: APIC ID to check
*/
bool apic_id_is_primary_thread(unsigned int apicid)
{
@@ -2324,9 +2423,8 @@ int generic_processor_info(int apicid, int version)
disabled_cpu_apicid == apicid) {
int thiscpu = num_processors + disabled_cpus;
- pr_warning("APIC: Disabling requested cpu."
- " Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
+ pr_warn("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n", thiscpu, apicid);
disabled_cpus++;
return -ENODEV;
@@ -2340,8 +2438,7 @@ int generic_processor_info(int apicid, int version)
apicid != boot_cpu_physical_apicid) {
int thiscpu = max + disabled_cpus - 1;
- pr_warning(
- "APIC: NR_CPUS/possible_cpus limit of %i almost"
+ pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
@@ -2352,9 +2449,8 @@ int generic_processor_info(int apicid, int version)
if (num_processors >= nr_cpu_ids) {
int thiscpu = max + disabled_cpus;
- pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
- "reached. Processor %d/0x%x ignored.\n",
- max, thiscpu, apicid);
+ pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. "
+ "Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
return -EINVAL;
@@ -2384,13 +2480,13 @@ int generic_processor_info(int apicid, int version)
* Validate version
*/
if (version == 0x0) {
- pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
- cpu, apicid);
+ pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+ cpu, apicid);
version = 0x10;
}
if (version != boot_cpu_apic_version) {
- pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+ pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
boot_cpu_apic_version, cpu, version);
}
@@ -2543,6 +2639,13 @@ static int lapic_suspend(void)
#endif
local_irq_save(flags);
+
+ /*
+ * Mask IOAPIC before disabling the local APIC to prevent stale IRR
+ * entries on some implementations.
+ */
+ mask_ioapic_entries();
+
disable_local_APIC();
irq_remapping_disable();
@@ -2759,7 +2862,7 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
else {
- pr_warning("APIC Verbosity level %s not recognised"
+ pr_warn("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index bbdca603f94a..7862b152a052 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -8,21 +8,14 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/acpi.h>
-#include <linux/errno.h>
-#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/hardirq.h>
#include <linux/export.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/apic.h>
-#include <asm/apic_flat_64.h>
#include <asm/jailhouse_para.h>
+#include <asm/apic.h>
+
+#include "local.h"
static struct apic apic_physflat;
static struct apic apic_flat;
@@ -83,35 +76,6 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
_flat_send_IPI_mask(mask, vector);
}
-static void flat_send_IPI_allbutself(int vector)
-{
- int cpu = smp_processor_id();
-
- if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
- if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
- unsigned long mask = cpumask_bits(cpu_online_mask)[0];
-
- if (cpu < BITS_PER_LONG)
- __clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
- }
- } else if (num_online_cpus() > 1) {
- __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
- vector, apic->dest_logical);
- }
-}
-
-static void flat_send_IPI_all(int vector)
-{
- if (vector == NMI_VECTOR) {
- flat_send_IPI_mask(cpu_online_mask, vector);
- } else {
- __default_send_IPI_shortcut(APIC_DEST_ALLINC,
- vector, apic->dest_logical);
- }
-}
-
static unsigned int flat_get_apic_id(unsigned long x)
{
return (x >> 24) & 0xFF;
@@ -173,9 +137,9 @@ static struct apic apic_flat __ro_after_init = {
.send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
.inquire_remote_apic = default_inquire_remote_apic,
@@ -225,16 +189,6 @@ static void physflat_init_apic_ldr(void)
*/
}
-static void physflat_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void physflat_send_IPI_all(int vector)
-{
- default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
-}
-
static int physflat_probe(void)
{
if (apic == &apic_physflat || num_possible_cpus() > 8 ||
@@ -276,9 +230,9 @@ static struct apic apic_physflat __ro_after_init = {
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 5078b5ce63a7..98c9bb75d185 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -9,25 +9,9 @@
* to not uglify the caller's code and allow to call (some) apic routines
* like self-ipi, etc...
*/
-
-#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-#include <linux/smp.h>
-#include <asm/ipi.h>
-
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820/api.h>
+#include <asm/apic.h>
static void noop_init_apic_ldr(void) { }
static void noop_send_IPI(int cpu, int vector) { }
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a5464b8b6c46..cdf45b4700f2 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -10,15 +10,15 @@
* Send feedback to <support@numascale.com>
*
*/
-
+#include <linux/types.h>
#include <linux/init.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
-#include <asm/ipi.h>
-#include <asm/apic_flat_64.h>
+
#include <asm/pgtable.h>
-#include <asm/pci_x86.h>
+
+#include "local.h"
u8 numachip_system __read_mostly;
static const struct apic apic_numachip1;
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index afee386ff711..38b5b51d42f6 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -4,18 +4,13 @@
*
* Drives the local APIC in "clustered mode".
*/
-#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/dmi.h>
#include <linux/smp.h>
-#include <asm/apicdef.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
+
+#include "local.h"
static unsigned bigsmp_get_apic_id(unsigned long x)
{
@@ -38,32 +33,12 @@ static int bigsmp_early_logical_apicid(int cpu)
return early_per_cpu(x86_cpu_to_apicid, cpu);
}
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
-
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = per_cpu(x86_bios_cpu_apicid, cpu);
- val |= SET_APIC_LOGICAL_ID(id);
-
- return val;
-}
-
/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * bigsmp enables physical destination mode
+ * and doesn't use LDR and DFR
*/
static void bigsmp_init_apic_ldr(void)
{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
}
static void bigsmp_setup_apic_routing(void)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c7bb6c69f21c..913c88617848 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- /* If we are moving the irq we need to mask it */
+ /* If we are moving the IRQ we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic_irq(data);
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- if (unlikely(masked)) {
+ if (unlikely(moveit)) {
/* Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
@@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
*/
if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
}
#endif
@@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
{
struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
- bool masked;
+ bool moveit;
int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(irq_data);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- ioapic_irqd_unmask(irq_data, masked);
+ ioapic_finish_move(irq_data, moveit);
}
static void ioapic_ir_ack_level(struct irq_data *irq_data)
@@ -2438,7 +2441,13 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
* dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
* gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
+ if (!ioapic_initialized)
+ return gsi_top;
+ /*
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and not
+ * updated. So simply return @from if ioapic_dynirq_base == 0.
+ */
+ return ioapic_dynirq_base ? : from;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 82f9244fe61f..6ca0f91372fd 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,24 +1,113 @@
// SPDX-License-Identifier: GPL-2.0
+
#include <linux/cpumask.h>
-#include <linux/interrupt.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/cpu.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/apic.h>
-#include <asm/proto.h>
-#include <asm/ipi.h>
-
-void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+#include <linux/smp.h>
+
+#include "local.h"
+
+DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+#ifdef CONFIG_SMP
+static int apic_ipi_shorthand_off __ro_after_init;
+
+static __init int apic_ipi_shorthand(char *str)
+{
+ get_option(&str, &apic_ipi_shorthand_off);
+ return 1;
+}
+__setup("no_ipi_broadcast=", apic_ipi_shorthand);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("IPI shorthand broadcast: %s\n",
+ apic_ipi_shorthand_off ? "disabled" : "enabled");
+ return 0;
+}
+late_initcall(print_ipi_mode);
+
+void apic_smt_update(void)
+{
+ /*
+ * Do not switch to broadcast mode if:
+ * - Disabled on the command line
+ * - Only a single CPU is online
+ * - Not all present CPUs have been at least booted once
+ *
+ * The latter is important as the local APIC might be in some
+ * random state and a broadcast might cause havoc. That's
+ * especially true for NMI broadcasting.
+ */
+ if (apic_ipi_shorthand_off || num_online_cpus() == 1 ||
+ !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) {
+ static_branch_disable(&apic_use_ipi_shorthand);
+ } else {
+ static_branch_enable(&apic_use_ipi_shorthand);
+ }
+}
+
+void apic_send_IPI_allbutself(unsigned int vector)
+{
+ if (num_online_cpus() < 2)
+ return;
+
+ if (static_branch_likely(&apic_use_ipi_shorthand))
+ apic->send_IPI_allbutself(vector);
+ else
+ apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
+
+/*
+ * Send a 'reschedule' IPI to another CPU. It goes straight through and
+ * wastes no time serializing anything. Worst case is that we lose a
+ * reschedule ...
+ */
+void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+ return;
+ }
+ apic->send_IPI(cpu, RESCHEDULE_VECTOR);
+}
+
+void native_send_call_func_single_ipi(int cpu)
+{
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask))
+ goto sendmask;
+
+ if (cpumask_test_cpu(cpu, mask))
+ apic->send_IPI_all(CALL_FUNCTION_VECTOR);
+ else if (num_online_cpus() > 1)
+ apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ return;
+ }
+
+sendmask:
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+}
+
+#endif /* CONFIG_SMP */
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+
+static inline void __xapic_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
{
/*
* Subtle. In the case of the 'never do double writes' workaround
@@ -32,12 +121,16 @@ void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int
/*
* Wait for idle.
*/
- __xapic_wait_icr_idle();
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
/*
- * No need to touch the target chip field
+ * No need to touch the target chip field. Also the destination
+ * mode is ignored when a shorthand is used.
*/
- cfg = __prepare_ICR(shortcut, vector, dest);
+ cfg = __prepare_ICR(shortcut, vector, 0);
/*
* Send the IPI. The write to APIC_ICR fires this off.
@@ -133,6 +226,21 @@ void default_send_IPI_single(int cpu, int vector)
apic->send_IPI_mask(cpumask_of(cpu), vector);
}
+void default_send_IPI_allbutself(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+void default_send_IPI_all(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
@@ -192,28 +300,6 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
local_irq_restore(flags);
}
-void default_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then we get an APIC send
- * error if we try to broadcast, thus avoid sending IPIs in this case.
- */
- if (!(num_online_cpus() > 1))
- return;
-
- __default_local_send_IPI_allbutself(vector);
-}
-
-void default_send_IPI_all(int vector)
-{
- __default_local_send_IPI_all(vector);
-}
-
-void default_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
-}
-
/* must come after the send_IPI functions above for inlining */
static int convert_apicid_to_cpu(int apic_id)
{
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
new file mode 100644
index 000000000000..04797f05ce94
--- /dev/null
+++ b/arch/x86/kernel/apic/local.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Historical copyright notices:
+ *
+ * Copyright 2004 James Cleverdon, IBM.
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/jump_label.h>
+
+#include <asm/apic.h>
+
+/* APIC flat 64 */
+void flat_init_apic_ldr(void);
+
+/* X2APIC */
+int x2apic_apic_id_valid(u32 apicid);
+int x2apic_apic_id_registered(void);
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+unsigned int x2apic_get_apic_id(unsigned long id);
+u32 x2apic_set_apic_id(unsigned int id);
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+void x2apic_send_IPI_self(int vector);
+void __x2apic_send_IPI_shorthand(int vector, u32 which);
+
+/* IPI */
+
+DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector);
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
+
+void default_send_IPI_single(int cpu, int vector);
+void default_send_IPI_single_phys(int cpu, int vector);
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_allbutself(int vector);
+void default_send_IPI_all(int vector);
+void default_send_IPI_self(int vector);
+
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7f7533462474..159bd0cb8548 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ee2d91e382f1..67b33d67002f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -6,51 +6,14 @@
*
* Generic x86 APIC driver probe layer.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
#include <linux/export.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-
#include <linux/smp.h>
-#include <asm/ipi.h>
-#include <linux/interrupt.h>
+#include <asm/apic.h>
#include <asm/acpi.h>
-#include <asm/e820/api.h>
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast = DEFAULT_SEND_IPI;
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- pr_info("Using %s mode\n",
- no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
- return 1;
-}
-__setup("no_ipi_broadcast=", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- pr_info("Using IPI %s mode\n",
- no_broadcast ? "No-Shortcut" : "Shortcut");
- return 0;
-}
-late_initcall(print_ipi_mode);
+#include "local.h"
static int default_x86_32_early_logical_apicid(int cpu)
{
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index e6560a02eb46..29f0e0984557 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -8,19 +8,9 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/hardirq.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/setup.h>
+
+#include "local.h"
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,13 +36,6 @@ void __init default_setup_apic_routing(void)
x86_platform.apic_post_init();
}
-/* Same for both flat and physical. */
-
-void apic_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
-}
-
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
struct apic **drv;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index fdacb864c3dd..2c5676b0a6e7 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -398,6 +398,17 @@ static int activate_reserved(struct irq_data *irqd)
if (!irqd_can_reserve(irqd))
apicd->can_reserve = false;
}
+
+ /*
+ * Check to ensure that the effective affinity mask is a subset
+ * the user supplied affinity mask, and warn the user if it is not
+ */
+ if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd),
+ irq_data_get_affinity_mask(irqd))) {
+ pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n",
+ irqd->irq);
+ }
+
return ret;
}
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
deleted file mode 100644
index a49b3604027f..000000000000
--- a/arch/x86/kernel/apic/x2apic.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Common bits for X2APIC cluster/physical modes. */
-
-int x2apic_apic_id_valid(u32 apicid);
-int x2apic_apic_id_registered(void);
-void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
-unsigned int x2apic_get_apic_id(unsigned long id);
-u32 x2apic_set_apic_id(unsigned int id);
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
-void x2apic_send_IPI_self(int vector);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 609e499387a1..b0889c48a2ac 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -1,15 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/threads.h>
+
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/dmar.h>
-#include <linux/irq.h>
-#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
-#include <asm/smp.h>
-#include "x2apic.h"
+#include "local.h"
struct cluster_mask {
unsigned int clusterid;
@@ -84,12 +82,12 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
static void x2apic_send_IPI_allbutself(int vector)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
static u32 x2apic_calc_apicid(unsigned int cpu)
@@ -158,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
{
struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
- cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, &cmsk->mask);
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index b5cf9e7b3830..bc9693841353 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -1,14 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/threads.h>
+
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/dmar.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include "x2apic.h"
+#include "local.h"
int x2apic_phys;
@@ -80,12 +75,12 @@ static void
static void x2apic_send_IPI_allbutself(int vector)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
static void init_x2apic_ldr(void)
@@ -117,6 +112,14 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
native_x2apic_icr_write(cfg, apicid);
}
+void __x2apic_send_IPI_shorthand(int vector, u32 which)
+{
+ unsigned long cfg = __prepare_ICR(which, vector, 0);
+
+ x2apic_wrmsr_fence();
+ native_x2apic_icr_write(cfg, 0);
+}
+
unsigned int x2apic_get_apic_id(unsigned long id)
{
return id;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1e225528f0d7..ad53b2abc859 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,50 +7,37 @@
*
* Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
+#include <linux/crash_dump.h>
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/hardirq.h>
#include <linux/proc_fs.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
+#include <linux/memory.h>
#include <linux/export.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/io.h>
#include <linux/pci.h>
-#include <linux/kdebug.h>
-#include <linux/delay.h>
-#include <linux/crash_dump.h>
-#include <linux/reboot.h>
-#include <linux/memory.h>
-#include <linux/numa.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <asm/e820/api.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
-#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
#include <asm/apic.h>
-#include <asm/e820/api.h>
-#include <asm/ipi.h>
-#include <asm/smp.h>
-#include <asm/x86_init.h>
-#include <asm/nmi.h>
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+static DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
-static bool uv_hubless_system;
+static int uv_hubbed_system;
+static int uv_hubless_system;
static u64 gru_start_paddr, gru_end_paddr;
static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
static u64 gru_dist_lmask, gru_dist_umask;
static union uvh_apicid uvh_apicid;
+/* Unpack OEM/TABLE ID's to be NULL terminated strings */
+static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
+static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+
/* Information derived from CPUID: */
static struct {
unsigned int apicid_shift;
@@ -268,17 +255,35 @@ static void __init uv_set_apicid_hibit(void)
}
}
-static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static void __init uv_stringify(int len, char *to, char *from)
+{
+ /* Relies on 'to' being NULL chars so result will be NULL terminated */
+ strncpy(to, from, len-1);
+}
+
+static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
{
int pnodeid;
int uv_apic;
+ uv_stringify(sizeof(oem_id), oem_id, _oem_id);
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
if (strncmp(oem_id, "SGI", 3) != 0) {
- if (strncmp(oem_id, "NSGI", 4) == 0) {
- uv_hubless_system = true;
- pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
- oem_id, oem_table_id);
- }
+ if (strncmp(oem_id, "NSGI", 4) != 0)
+ return 0;
+
+ /* UV4 Hubless, CH, (0x11:UV4+Any) */
+ if (strncmp(oem_id, "NSGI4", 5) == 0)
+ uv_hubless_system = 0x11;
+
+ /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */
+ else
+ uv_hubless_system = 0x9;
+
+ pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n",
+ oem_id, oem_table_id, uv_hubless_system);
+
return 0;
}
@@ -306,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
if (uv_hub_info->hub_revision == 0)
goto badbios;
+ switch (uv_hub_info->hub_revision) {
+ case UV4_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x11;
+ break;
+
+ case UV3_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x9;
+ break;
+
+ case UV2_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x5;
+ break;
+
+ case UV1_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x3;
+ break;
+ }
+
pnodeid = early_get_pnodeid();
early_get_apic_socketid_shift();
@@ -356,9 +379,15 @@ int is_uv_system(void)
}
EXPORT_SYMBOL_GPL(is_uv_system);
-int is_uv_hubless(void)
+int is_uv_hubbed(int uvtype)
+{
+ return (uv_hubbed_system & uvtype);
+}
+EXPORT_SYMBOL_GPL(is_uv_hubbed);
+
+int is_uv_hubless(int uvtype)
{
- return uv_hubless_system;
+ return (uv_hubless_system & uvtype);
}
EXPORT_SYMBOL_GPL(is_uv_hubless);
@@ -1275,7 +1304,8 @@ static int __init decode_uv_systab(void)
struct uv_systab *st;
int i;
- if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+ /* If system is uv3 or lower, there is no extended UVsystab */
+ if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4))
return 0; /* No extended UVsystab required */
st = uv_systab;
@@ -1454,6 +1484,72 @@ static void __init build_socket_tables(void)
}
}
+/* Check which reboot to use */
+static void check_efi_reboot(void)
+{
+ /* If EFI reboot not available, use ACPI reboot */
+ if (!efi_enabled(EFI_BOOT))
+ reboot_type = BOOT_ACPI;
+}
+
+/* Setup user proc fs files */
+static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "0x%x\n", uv_hubbed_system);
+ return 0;
+}
+
+static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "0x%x\n", uv_hubless_system);
+ return 0;
+}
+
+static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "%s/%s\n", oem_id, oem_table_id);
+ return 0;
+}
+
+static __init void uv_setup_proc_files(int hubless)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(UV_PROC_NODE, NULL);
+ proc_create_single("oemid", 0, pde, proc_oemid_show);
+ if (hubless)
+ proc_create_single("hubless", 0, pde, proc_hubless_show);
+ else
+ proc_create_single("hubbed", 0, pde, proc_hubbed_show);
+}
+
+/* Initialize UV hubless systems */
+static __init int uv_system_init_hubless(void)
+{
+ int rc;
+
+ /* Setup PCH NMI handler */
+ uv_nmi_setup_hubless();
+
+ /* Init kernel/BIOS interface */
+ rc = uv_bios_init();
+ if (rc < 0)
+ return rc;
+
+ /* Process UVsystab */
+ rc = decode_uv_systab();
+ if (rc < 0)
+ return rc;
+
+ /* Create user access node */
+ if (rc >= 0)
+ uv_setup_proc_files(1);
+
+ check_efi_reboot();
+
+ return rc;
+}
+
static void __init uv_system_init_hub(void)
{
struct uv_hub_info_s hub_info = {0};
@@ -1579,32 +1675,27 @@ static void __init uv_system_init_hub(void)
uv_nmi_setup();
uv_cpu_init();
uv_scir_register_cpu_notifier();
- proc_mkdir("sgi_uv", NULL);
+ uv_setup_proc_files(0);
/* Register Legacy VGA I/O redirection handler: */
pci_register_set_vga_state(uv_set_vga_state);
- /*
- * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
- * EFI is not enabled in the kdump kernel:
- */
- if (is_kdump_kernel())
- reboot_type = BOOT_ACPI;
+ check_efi_reboot();
}
/*
- * There is a small amount of UV specific code needed to initialize a
- * UV system that does not have a "UV HUB" (referred to as "hubless").
+ * There is a different code path needed to initialize a UV system that does
+ * not have a "UV HUB" (referred to as "hubless").
*/
void __init uv_system_init(void)
{
- if (likely(!is_uv_system() && !is_uv_hubless()))
+ if (likely(!is_uv_system() && !is_uv_hubless(1)))
return;
if (is_uv_system())
uv_system_init_hub();
else
- uv_nmi_setup_hubless();
+ uv_system_init_hubless();
}
apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d3d075226c0a..24d2fde30d00 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -6,13 +6,28 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
+#define __SYSCALL_X32(nr, sym, qual)
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
+#undef __SYSCALL_64
+#undef __SYSCALL_X32
+
+#ifdef CONFIG_X86_X32_ABI
+#define __SYSCALL_64(nr, sym, qual)
+#define __SYSCALL_X32(nr, sym, qual) [nr] = 1,
+static char syscalls_x32[] = {
+#include <asm/syscalls_64.h>
+};
+#undef __SYSCALL_64
+#undef __SYSCALL_X32
+#endif
+
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
+#undef __SYSCALL_I386
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#include <asm/kvm_para.h>
@@ -62,7 +77,6 @@ int main(void)
ENTRY(cr2);
ENTRY(cr3);
ENTRY(cr4);
- ENTRY(cr8);
ENTRY(gdt_desc);
BLANK();
#undef ENTRY
@@ -80,6 +94,11 @@ int main(void)
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
+#ifdef CONFIG_X86_X32_ABI
+ DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1);
+ DEFINE(X32_NR_syscalls, sizeof(syscalls_x32));
+#endif
+
DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..7dc4ad68eb41 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -29,8 +29,9 @@ obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o
+obj-y += intel.o intel_pconfig.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
@@ -53,11 +54,12 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8d4e50428b68..ac83a0fef628 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/random.h>
+#include <linux/topology.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -318,13 +319,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
-
-static void amd_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -614,9 +608,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
return;
clear_all:
- clear_cpu_cap(c, X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_SME);
clear_sev:
- clear_cpu_cap(c, X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
}
}
@@ -716,7 +710,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
}
}
- amd_get_topology_early(c);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -804,6 +799,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c)
msr_set_bit(MSR_AMD64_DE_CFG, 31);
}
+static bool rdrand_force;
+
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
+ }
+
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
@@ -818,12 +871,23 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
+
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
}
static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
+#endif
+
/*
* Fix erratum 1076: CPB feature bit not being set in CPUID.
* Always set it, except when running under a hypervisor.
@@ -860,6 +924,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
case 0x17: init_amd_zn(c); break;
}
@@ -879,12 +944,8 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_XMM2)) {
- unsigned long long val;
- int ret;
-
/*
- * A serializing LFENCE has less overhead than MFENCE, so
- * use it for execution serialization. On families which
+ * Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
@@ -892,19 +953,8 @@ static void init_amd(struct cpuinfo_x86 *c)
msr_set_bit(MSR_F10H_DECFG,
MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
- /*
- * Verify that the MSR write was successful (could be running
- * under a hypervisor) and only then assume that LFENCE is
- * serializing.
- */
- ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
- if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
- /* A serializing LFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- } else {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
- }
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
/*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c6fa3ef10b4e..ed54b3b21c39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
+static void __init mds_print_mitigation(void);
+static void __init taa_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -105,6 +107,13 @@ void __init check_bugs(void)
ssb_select_mitigation();
l1tf_select_mitigation();
mds_select_mitigation();
+ taa_select_mitigation();
+
+ /*
+ * As MDS and TAA mitigations are inter-related, print MDS
+ * mitigation until after TAA mitigation selection is done.
+ */
+ mds_print_mitigation();
arch_smt_update();
@@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void)
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
}
+}
+
+static void __init mds_print_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
pr_info("%s\n", mds_strings[mds_mitigation]);
}
@@ -269,6 +284,120 @@ static int __init mds_cmdline(char *str)
early_param("mds", mds_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ goto out;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /*
+ * TAA mitigation via VERW is turned off if both
+ * tsx_async_abort=off and mds=off are specified.
+ */
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ mds_mitigation == MDS_MITIGATION_OFF)
+ goto out;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+
+ /*
+ * Update MDS mitigation, if necessary, as the mds_user_clear is
+ * now enabled for TAA mitigation.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+out:
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -786,13 +915,10 @@ static void update_mds_branch_idle(void)
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
-void arch_smt_update(void)
+void cpu_bugs_smt_update(void)
{
- /* Enhanced IBRS implies STIBP. No update required. */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
- return;
-
mutex_lock(&spec_ctrl_mutex);
switch (spectre_v2_user) {
@@ -819,6 +945,17 @@ void arch_smt_update(void)
break;
}
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1149,6 +1286,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -1184,15 +1324,15 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_SANDYBRIDGE:
case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL_CORE:
- case INTEL_FAM6_HASWELL_ULT:
- case INTEL_FAM6_HASWELL_GT3E:
- case INTEL_FAM6_BROADWELL_CORE:
- case INTEL_FAM6_BROADWELL_GT3E:
- case INTEL_FAM6_SKYLAKE_MOBILE:
- case INTEL_FAM6_SKYLAKE_DESKTOP:
- case INTEL_FAM6_KABYLAKE_MOBILE:
- case INTEL_FAM6_KABYLAKE_DESKTOP:
+ case INTEL_FAM6_HASWELL:
+ case INTEL_FAM6_HASWELL_L:
+ case INTEL_FAM6_HASWELL_G:
+ case INTEL_FAM6_BROADWELL:
+ case INTEL_FAM6_BROADWELL_G:
+ case INTEL_FAM6_SKYLAKE_L:
+ case INTEL_FAM6_SKYLAKE:
+ case INTEL_FAM6_KABYLAKE_L:
+ case INTEL_FAM6_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -1304,11 +1444,24 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
#endif
static ssize_t mds_show_state(char *buf)
@@ -1328,6 +1481,21 @@ static ssize_t mds_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1398,6 +1566,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_MDS:
return mds_show_state(buf);
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
default:
break;
}
@@ -1434,4 +1608,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
#endif
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 14433ff5b828..426792565d86 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,13 +18,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -71,8 +64,6 @@ static void init_c3(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size * 2;
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
-
- cpu_detect_cache_sizes(c);
}
enum {
@@ -119,31 +110,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
}
}
-static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
@@ -250,8 +216,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- centaur_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f125bf7ecb6f..52c9bfbbdb2a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -14,6 +14,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
@@ -24,6 +25,7 @@
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -48,15 +50,12 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
-#endif
#include "cpu.h"
@@ -165,22 +164,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-static int __init x86_mpx_setup(char *s)
-{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
-
- /* do not emit a message if the feature is not present */
- if (!boot_cpu_has(X86_FEATURE_MPX))
- return 1;
-
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
- return 1;
-}
-__setup("nompx", x86_mpx_setup);
-
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
{
@@ -307,8 +290,6 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
static __init int setup_disable_smep(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_SMEP);
- /* Check for things that depend on SMEP being enabled: */
- check_mpx_erratum(&boot_cpu_data);
return 1;
}
__setup("nosmep", setup_disable_smep);
@@ -565,8 +546,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
-__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
void load_percpu_segment(int cpu)
{
@@ -1016,13 +998,15 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
-#define NO_SPECULATION BIT(0)
-#define NO_MELTDOWN BIT(1)
-#define NO_SSB BIT(2)
-#define NO_L1TF BIT(3)
-#define NO_MDS BIT(4)
-#define MSBDS_ONLY BIT(5)
-#define NO_SWAPGS BIT(6)
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1043,26 +1027,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
-
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1072,15 +1057,21 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
+ VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
+
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
{}
};
@@ -1091,18 +1082,31 @@ static bool __init cpu_matches(unsigned long which)
return m && !!(m->driver_data & which);
}
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+u64 x86_read_arch_cap_msr(void)
{
u64 ia32_cap = 0;
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+ return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ if (!cpu_matches(NO_SPECTRE_V2))
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
@@ -1120,6 +1124,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (!cpu_matches(NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present or running as guest that don't get TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
if (cpu_matches(NO_MELTDOWN))
return;
@@ -1420,6 +1439,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
@@ -1553,6 +1575,8 @@ void __init identify_boot_cpu(void)
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
+
+ tsx_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1748,7 +1772,7 @@ static void wait_for_master_cpu(int cpu)
}
#ifdef CONFIG_X86_64
-static void setup_getcpu(int cpu)
+static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
@@ -1768,7 +1792,50 @@ static void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+
+static inline void ucode_cpu_init(int cpu)
+{
+ if (cpu)
+ load_ucode_ap();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void setup_getcpu(int cpu) { }
+
+static inline void ucode_cpu_init(int cpu)
+{
+ show_ucode_info_early();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
#endif
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1776,21 +1843,15 @@ static void setup_getcpu(int cpu)
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
void cpu_init(void)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- struct task_struct *me;
- struct tss_struct *t;
- int i;
wait_for_master_cpu(cpu);
- if (cpu)
- load_ucode_ap();
-
- t = &per_cpu(cpu_tss_rw, cpu);
+ ucode_cpu_init(cpu);
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
@@ -1799,63 +1860,47 @@ void cpu_init(void)
#endif
setup_getcpu(cpu);
- me = current;
-
pr_debug("Initializing CPU#%d\n", cpu);
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
-
switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
load_current_idt();
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- x86_configure_nx();
- x2apic_setup();
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
- /*
- * set up and load the per-CPU TSS
- */
- if (!t->x86_tss.ist[0]) {
- t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
- t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
- t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
- t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ x2apic_setup();
}
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
mmgrab(&init_mm);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, me);
+ enter_lazy_tlb(&init_mm, cur);
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
+ /* Initialize the TSS. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
load_TR_desc();
+ /*
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
+ */
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1863,6 +1908,8 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
+ doublefault_init_cpu_tss();
+
fpu__init_cpu();
if (is_uv_system())
@@ -1871,63 +1918,6 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
-#else
-
-void cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
-
- wait_for_master_cpu(cpu);
-
- show_ucode_info_early();
-
- pr_info("Initializing CPU#%d\n", cpu);
-
- if (cpu_feature_enabled(X86_FEATURE_VME) ||
- boot_cpu_has(X86_FEATURE_TSC) ||
- boot_cpu_has(X86_FEATURE_DE))
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- load_current_idt();
- switch_to_new_gdt(cpu);
-
- /*
- * Set up and load the per-CPU TSS and LDT
- */
- mmgrab(&init_mm);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, curr);
-
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
- load_TR_desc();
- load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
-
- load_mm_ldt(&init_mm);
-
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-
- clear_all_debug_regs();
- dbg_restore_debug_regs();
-
- fpu__init_cpu();
-
- load_fixmap_gdt(cpu);
-}
-#endif
-
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
@@ -1957,3 +1947,14 @@ void microcode_check(void)
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+
+/*
+ * Invoked from core CPU hotplug code after hotplug operations
+ */
+void arch_smt_update(void)
+{
+ /* Handle the speculative execution misfeatures */
+ cpu_bugs_smt_update();
+ /* Check whether IPI broadcasting can be enabled */
+ apic_smt_update();
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c0e2407abdd6..37fdefd14f28 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
@@ -62,4 +78,10 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern u64 x86_read_arch_cap_msr(void);
+
+#ifdef CONFIG_IA32_FEAT_CTL
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
+#endif
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b5353244749b..3cbe24ca80ab 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -20,54 +20,55 @@ struct cpuid_dep {
* but it's difficult to tell that to the init reference checker.
*/
static const struct cpuid_dep cpuid_deps[] = {
- { X86_FEATURE_FXSR, X86_FEATURE_FPU },
- { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
- { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
- { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
- { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
- { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
- { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
- { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
- { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
- { X86_FEATURE_MMX, X86_FEATURE_FXSR },
- { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
- { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
- { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
- { X86_FEATURE_XMM, X86_FEATURE_FXSR },
- { X86_FEATURE_XMM2, X86_FEATURE_XMM },
- { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
- { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
- { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
- { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
- { X86_FEATURE_AES, X86_FEATURE_XMM2 },
- { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
- { X86_FEATURE_FMA, X86_FEATURE_AVX },
- { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
- { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
- { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
- { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..0268185bef94
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool tboot = tboot_enabled();
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ return;
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ wrmsrl(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ return;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 415621ddb8a2..4e28c1fc8749 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -330,12 +330,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
init_hygon_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_XMM2)) {
- unsigned long long val;
- int ret;
-
/*
- * A serializing LFENCE has less overhead than MFENCE, so
- * use it for execution serialization. On families which
+ * Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
@@ -343,19 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
msr_set_bit(MSR_F10H_DECFG,
MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
- /*
- * Verify that the MSR write was successful (could be running
- * under a hypervisor) and only then assume that LFENCE is
- * serializing.
- */
- ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
- if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
- /* A serializing LFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- } else {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
- }
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
/*
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8d6d92ebeb54..be82cd5841c3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -32,41 +32,6 @@
#endif
/*
- * Just in case our CPU detection goes bad, or you have a weird system,
- * allow a way to override the automatic disabling of MPX.
- */
-static int forcempx;
-
-static int __init forcempx_setup(char *__unused)
-{
- forcempx = 1;
-
- return 1;
-}
-__setup("intel-skd-046-workaround=disable", forcempx_setup);
-
-void check_mpx_erratum(struct cpuinfo_x86 *c)
-{
- if (forcempx)
- return;
- /*
- * Turn off the MPX feature on CPUs where SMEP is not
- * available or disabled.
- *
- * Works around Intel Erratum SKD046: "Branch Instructions
- * May Initialize MPX Bound Registers Incorrectly".
- *
- * This might falsely disable MPX on systems without
- * SMEP, like Atom processors without SMEP. But there
- * is no such hardware known at the moment.
- */
- if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
- }
-}
-
-/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
@@ -142,21 +107,21 @@ struct sku_microcode {
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
{ INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
+ { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
+ { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
{ INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
+ { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
+ { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
+ { INTEL_FAM6_HASWELL, 0x03, 0x23 },
{ INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
{ INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
{ INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
@@ -265,9 +230,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
if (c->x86 == 6) {
switch (c->x86_model) {
- case 0x27: /* Penwell */
- case 0x35: /* Cloverview */
- case 0x4a: /* Merrifield */
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
+ case INTEL_FAM6_ATOM_AIRMONT_NP:
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
@@ -329,7 +295,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
- check_mpx_erratum(c);
check_memory_type_self_snoop_errata(c);
/*
@@ -493,52 +458,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
- u32 msr_vpid_cap, msr_ept_cap;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
- clear_cpu_cap(c, X86_FEATURE_EPT_AD);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
- set_cpu_cap(c, X86_FEATURE_EPT);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- msr_ept_cap, msr_vpid_cap);
- if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
- set_cpu_cap(c, X86_FEATURE_EPT_AD);
- }
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
#define MSR_IA32_TME_ACTIVATE 0x982
/* Helpers to access TME_ACTIVATE MSR */
@@ -754,13 +673,17 @@ static void init_intel(struct cpuinfo_x86 *c)
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
init_intel_misc_features(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
}
#ifdef CONFIG_X86_32
@@ -813,7 +736,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -841,7 +764,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
@@ -853,8 +776,8 @@ static void intel_tlb_lookup(const unsigned char desc)
return;
/* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc && \
- intel_tlb_table[k].descriptor != 0; k++)
+ for (k = 0; intel_tlb_table[k].descriptor != desc &&
+ intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 6ea7fdc82f3c..b3a50d962851 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -78,6 +78,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_DE] = { "decode_unit", "Decode Unit" },
@@ -138,6 +139,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
@@ -266,10 +268,10 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
smca_set_misc_banks_map(bank, cpu);
/* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid)
+ if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -583,7 +585,7 @@ bool amd_filter_mce(struct mce *m)
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
int i, num_msrs;
u64 hwcr;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 743370ee4983..2c4f949611e4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -53,8 +53,6 @@
#include "internal.h"
-static DEFINE_MUTEX(mce_log_mutex);
-
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
@@ -156,19 +154,10 @@ void mce_log(struct mce *m)
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
}
-
-void mce_inject_log(struct mce *m)
-{
- mutex_lock(&mce_log_mutex);
- mce_log(m);
- mutex_unlock(&mce_log_mutex);
-}
-EXPORT_SYMBOL_GPL(mce_inject_log);
-
-static struct notifier_block mce_srao_nb;
+EXPORT_SYMBOL_GPL(mce_log);
/*
- * We run the default notifier if we have only the SRAO, the first and the
+ * We run the default notifier if we have only the UC, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
@@ -488,8 +477,9 @@ int mce_usable_address(struct mce *m)
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
- /* Checks after this one are Intel-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ /* Checks after this one are Intel/Zhaoxin-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
if (!(m->status & MCI_STATUS_MISCV))
@@ -507,10 +497,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
{
- if (m->cpuvendor == X86_VENDOR_AMD ||
- m->cpuvendor == X86_VENDOR_HYGON) {
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
- } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -527,9 +520,10 @@ bool mce_is_memory_error(struct mce *m)
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
- }
- return false;
+ default:
+ return false;
+ }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
@@ -589,26 +583,29 @@ static struct notifier_block first_nb = {
.priority = MCE_PRIO_FIRST,
};
-static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
- if (!mce)
+ if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
- if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
- pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
- }
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = mce->addr >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
return NOTIFY_OK;
}
-static struct notifier_block mce_srao_nb = {
- .notifier_call = srao_decode_notifier,
- .priority = MCE_PRIO_SRAO,
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -758,26 +755,22 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
log_it:
error_seen = true;
- mce_read_aux(&m, i);
+ if (flags & MCP_DONTLOG)
+ goto clear_it;
+ mce_read_aux(&m, i);
m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
- mce_log(&m);
- else if (mce_usable_address(&m)) {
- /*
- * Although we skipped logging this, we still want
- * to take action. Add to the pool so the registered
- * notifiers will see it.
- */
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
+ if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ goto clear_it;
+
+ mce_log(&m);
+
+clear_it:
/*
* Clear state for this bank.
*/
@@ -802,7 +795,7 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
- char *tmp;
+ char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
@@ -814,8 +807,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
+ m->bank = i;
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
- m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -1127,6 +1120,12 @@ static bool __mc_check_crashing_cpu(int cpu)
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
@@ -1221,8 +1220,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
int cpu = smp_processor_id();
- char *msg = "Unknown";
struct mce m, *final;
+ char *msg = NULL;
int worst = 0;
/*
@@ -1277,9 +1276,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Check if this MCE is signaled to only this logical processor,
- * on Intel only.
+ * on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL)
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@@ -1353,7 +1353,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
ist_end_non_atomic();
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, NULL);
+ mce_panic("Failed kernel mode recovery", &m, msg);
}
out_ist:
@@ -1697,6 +1697,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
}
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+ }
+
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
@@ -1760,6 +1772,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
}
}
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+ mce_adjust_timer = cmci_intel_adjust_timer;
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1781,6 +1822,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_centaur_feature_init(c);
break;
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
default:
break;
}
@@ -1792,6 +1837,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
default:
break;
}
@@ -1979,7 +2029,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
- mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2014,15 +2064,16 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
- * are socket-wide.
- * Disabling them for just a single offlined CPU is bad, since it will
- * inhibit reporting for all shared resources on the socket like the
- * last level cache (LLC), the integrated memory controller (iMC), etc.
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
return;
mce_disable_error_reporting();
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 1f30117b24ba..3413b41b8d55 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -494,7 +494,7 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
+ mce_log(&i_mce);
return;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e43eb6732630..5627b1091b85 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -85,8 +85,10 @@ static int cmci_supported(int *banks)
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
+
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -113,15 +115,16 @@ static bool lmce_supported(void)
/*
* BIOS should indicate support for LMCE by setting bit 20 in
- * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
- * generate a #GP fault.
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
- if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
- (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
- return true;
+ rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
- return false;
+ return tmp & FEAT_CTL_LMCE_ENABLED;
}
bool mce_intel_cmci_poll(void)
@@ -423,7 +426,7 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-static void intel_init_cmci(void)
+void intel_init_cmci(void)
{
int banks;
@@ -442,7 +445,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
-static void intel_init_lmce(void)
+void intel_init_lmce(void)
{
u64 val;
@@ -455,7 +458,7 @@ static void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
-static void intel_clear_lmce(void)
+void intel_clear_lmce(void)
{
u64 val;
@@ -479,9 +482,10 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case INTEL_FAM6_IVYBRIDGE_X:
case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_ICELAKE_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 43031db429d2..b785c0d0b590 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
#endif
void mce_timer_kick(unsigned long interval);
@@ -78,8 +84,6 @@ static inline int apei_clear_mce(u64 record_id)
}
#endif
-void mce_inject_log(struct mce *m);
-
/*
* We consider records to be equivalent if bank+status+addr+misc all match.
* This is only used when the system is going down because of a fatal error
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 210f1f5db5f7..87bcdc6dc2f0 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -107,11 +107,11 @@ static struct severity {
*/
MCESEV(
AO, "Action optional: memory scrubbing error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
),
/* ignore OVER for UCNA */
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index 6e2becf547c5..58b4ee3cda77 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
-/*
- * Current thermal event state:
+/**
+ * struct _thermal_state - Represent the current thermal event state
+ * @next_check: Stores the next timestamp, when it is allowed
+ * to log the next warning message.
+ * @last_interrupt_time: Stores the timestamp for the last threshold
+ * high event.
+ * @therm_work: Delayed workqueue structure
+ * @count: Stores the current running count for thermal
+ * or power threshold interrupts.
+ * @last_count: Stores the previous running count for thermal
+ * or power threshold interrupts.
+ * @max_time_ms: This shows the maximum amount of time CPU was
+ * in throttled state for a single thermal
+ * threshold high to low state.
+ * @total_time_ms: This is a cumulative time during which CPU was
+ * in the throttled state.
+ * @rate_control_active: Set when a throttling message is logged.
+ * This is used for the purpose of rate-control.
+ * @new_event: Stores the last high/low status of the
+ * THERM_STATUS_PROCHOT or
+ * THERM_STATUS_POWER_LIMIT.
+ * @level: Stores whether this _thermal_state instance is
+ * for a CORE level or for PACKAGE level.
+ * @sample_index: Index for storing the next sample in the buffer
+ * temp_samples[].
+ * @sample_count: Total number of samples collected in the buffer
+ * temp_samples[].
+ * @average: The last moving average of temperature samples
+ * @baseline_temp: Temperature at which thermal threshold high
+ * interrupt was generated.
+ * @temp_samples: Storage for temperature samples to calculate
+ * moving average.
+ *
+ * This structure is used to represent data related to thermal state for a CPU.
+ * There is a separate storage for core and package level for each CPU.
*/
struct _thermal_state {
- bool new_event;
- int event;
u64 next_check;
+ u64 last_interrupt_time;
+ struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
+ unsigned long max_time_ms;
+ unsigned long total_time_ms;
+ bool rate_control_active;
+ bool new_event;
+ u8 level;
+ u8 sample_index;
+ u8 sample_count;
+ u8 average;
+ u8 baseline_temp;
+ u8 temp_samples[3];
};
struct thermal_state {
@@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(core_throttle, max_time_ms);
+define_therm_throt_device_one_ro(core_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, max_time_ms);
+define_therm_throt_device_one_ro(package_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(core_throttle, total_time_ms);
+define_therm_throt_device_one_ro(core_throttle_total_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, total_time_ms);
+define_therm_throt_device_one_ro(package_throttle_total_time_ms);
+
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_max_time_ms.attr,
+ &dev_attr_core_throttle_total_time_ms.attr,
NULL
};
@@ -135,6 +192,112 @@ static const struct attribute_group thermal_attr_group = {
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
+#define THERM_THROT_POLL_INTERVAL HZ
+#define THERM_STATUS_PROCHOT_LOG BIT(1)
+
+#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
+#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+
+static void clear_therm_status_log(int level)
+{
+ int msr;
+ u64 mask, msr_val;
+
+ if (level == CORE_LEVEL) {
+ msr = MSR_IA32_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_CORE_MASK;
+ } else {
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_PKG_MASK;
+ }
+
+ rdmsrl(msr, msr_val);
+ msr_val &= mask;
+ wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
+}
+
+static void get_therm_status(int level, bool *proc_hot, u8 *temp)
+{
+ int msr;
+ u64 msr_val;
+
+ if (level == CORE_LEVEL)
+ msr = MSR_IA32_THERM_STATUS;
+ else
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+
+ rdmsrl(msr, msr_val);
+ if (msr_val & THERM_STATUS_PROCHOT_LOG)
+ *proc_hot = true;
+ else
+ *proc_hot = false;
+
+ *temp = (msr_val >> 16) & 0x7F;
+}
+
+static void __maybe_unused throttle_active_work(struct work_struct *work)
+{
+ struct _thermal_state *state = container_of(to_delayed_work(work),
+ struct _thermal_state, therm_work);
+ unsigned int i, avg, this_cpu = smp_processor_id();
+ u64 now = get_jiffies_64();
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /* temperature value is offset from the max so lesser means hotter */
+ if (!hot && temp > state->baseline_temp) {
+ if (state->rate_control_active)
+ pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+
+ state->rate_control_active = false;
+ return;
+ }
+
+ if (time_before64(now, state->next_check) &&
+ state->rate_control_active)
+ goto re_arm;
+
+ state->next_check = now + CHECK_INTERVAL;
+
+ if (state->count != state->last_count) {
+ /* There was one new thermal interrupt */
+ state->last_count = state->count;
+ state->average = 0;
+ state->sample_count = 0;
+ state->sample_index = 0;
+ }
+
+ state->temp_samples[state->sample_index] = temp;
+ state->sample_count++;
+ state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
+ if (state->sample_count < ARRAY_SIZE(state->temp_samples))
+ goto re_arm;
+
+ avg = 0;
+ for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
+ avg += state->temp_samples[i];
+
+ avg /= ARRAY_SIZE(state->temp_samples);
+
+ if (state->average > avg) {
+ pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ state->rate_control_active = true;
+ }
+
+ state->average = avg;
+
+re_arm:
+ clear_therm_status_log(state->level);
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+}
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -178,27 +341,33 @@ static void therm_throt_process(bool new_event, int event, int level)
if (new_event)
state->count++;
- if (time_before64(now, state->next_check) &&
- state->count != state->last_count)
+ if (event != THERMAL_THROTTLING_EVENT)
return;
- state->next_check = now + CHECK_INTERVAL;
- state->last_count = state->count;
+ if (new_event && !state->last_interrupt_time) {
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /*
+ * Ignore short temperature spike as the system is not close
+ * to PROCHOT. 10C offset is large enough to ignore. It is
+ * already dropped from the high threshold temperature.
+ */
+ if (temp > 10)
+ return;
- /* if we just entered the thermal event */
- if (new_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- return;
- }
- if (old_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
- return;
+ state->baseline_temp = temp;
+ state->last_interrupt_time = now;
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+ } else if (old_event && state->last_interrupt_time) {
+ unsigned long throttle_time;
+
+ throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
+ if (throttle_time > state->max_time_ms)
+ state->max_time_ms = throttle_time;
+ state->total_time_ms += throttle_time;
+ state->last_interrupt_time = 0;
}
}
@@ -244,20 +413,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
if (err)
return err;
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
+
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_max_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_total_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
}
+ return 0;
+
+del_group:
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+
return err;
}
@@ -269,15 +465,34 @@ static void thermal_throttle_remove_dev(struct device *dev)
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ u32 l;
+
+ state->package_throttle.level = PACKAGE_LEVEL;
+ state->core_throttle.level = CORE_LEVEL;
+
+ INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
+ INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
+
+ /* Unmask the thermal vector after the above workqueues are initialized. */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ cancel_delayed_work(&state->package_throttle.therm_work);
+ cancel_delayed_work(&state->core_throttle.therm_work);
+
+ state->package_throttle.rate_control_active = false;
+ state->core_throttle.rate_control_active = false;
+
thermal_throttle_remove_dev(dev);
return 0;
}
@@ -512,10 +727,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a0e52bd00ecc..3f6b137ef4e6 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
mc = (struct microcode_amd *)amd_ucode_patch;
@@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
BUG_ON(raw_smp_processor_id() != cpu);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index cb0fdcaf1415..7019d4b2df0c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache);
*/
static DEFINE_MUTEX(microcode_mutex);
-/*
- * Serialize late loading so that CPUs get updated one-by-one.
- */
-static DEFINE_RAW_SPINLOCK(update_lock);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
struct cpu_info_ctx {
@@ -566,11 +561,18 @@ static int __reload_late(void *info)
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
return -1;
- raw_spin_lock(&update_lock);
- apply_microcode_local(&err);
- raw_spin_unlock(&update_lock);
+ /*
+ * On an SMT system, it suffices to load the microcode on one sibling of
+ * the core because the microcode engine is shared between the threads.
+ * Synchronization still needs to take place so that no concurrent
+ * loading attempts happen on multiple threads of an SMT core. See
+ * below.
+ */
+ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
+ apply_microcode_local(&err);
+ else
+ goto wait_for_siblings;
- /* siblings return UCODE_OK because their engine got updated already */
if (err > UCODE_NFOUND) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
ret = -1;
@@ -578,14 +580,18 @@ static int __reload_late(void *info)
ret = 1;
}
+wait_for_siblings:
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
+ panic("Timeout during microcode update!\n");
+
/*
- * Increase the wait timeout to a safe value here since we're
- * serializing the microcode update and that could take a while on a
- * large number of CPUs. And that is fine as the *actual* timeout will
- * be determined by the last CPU finished updating and thus cut short.
+ * At least one thread has completed update on each core.
+ * For others, simply call the update to make sure the
+ * per-cpu cpuinfo can be updated with right microcode
+ * revision.
*/
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
- panic("Timeout during microcode update!\n");
+ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
+ apply_microcode_local(&err);
return ret;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ce799cfe9434..6a99535d7f37 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
struct microcode_intel *mc;
enum ucode_state ret;
static int prev_rev;
@@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
return UCODE_ERROR;
}
- if (rev != prev_rev) {
+ if (bsp && rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
@@ -852,7 +853,7 @@ out:
c->microcode = rev;
/* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (c->cpu_index == boot_cpu_data.cpu_index)
+ if (bsp)
boot_cpu_data.microcode = rev;
return ret;
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index aed45b8895d5..1db560ed2ca3 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,8 +6,7 @@
set -e
-IN=$1
-OUT=$2
+OUT=$1
dump_array()
{
@@ -15,6 +14,7 @@ dump_array()
SIZE=$2
PFX=$3
POSTFIX=$4
+ IN=$5
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
@@ -57,11 +57,18 @@ trap 'rm "$OUT"' EXIT
echo "#endif"
echo ""
- dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
echo ""
- dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 062f77279ce3..caa032ce3fe3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -29,6 +29,7 @@
#include <asm/timer.h>
#include <asm/reboot.h>
#include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -215,6 +216,10 @@ static void __init ms_hyperv_init_platform(void)
int hv_host_info_ecx;
int hv_host_info_edx;
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
/*
* Extract the features and hints
*/
@@ -285,7 +290,12 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.shutdown = hv_machine_shutdown;
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
- mark_tsc_unstable("running on Hyper-V");
+ if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) {
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ } else {
+ mark_tsc_unstable("running on Hyper-V");
+ }
/*
* Generation 2 instances don't support reading the NMI status from
@@ -338,6 +348,15 @@ static void __init ms_hyperv_init_platform(void)
x2apic_phys = 1;
# endif
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
+#endif
+}
+
+void hv_setup_sched_clock(void *sched_clock)
+{
+#ifdef CONFIG_PARAVIRT
+ pv_ops.time.sched_clock = sched_clock;
#endif
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index aa5c064a6a22..51b9190c628b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -15,7 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4d36dcc1cf87..a5c506f6da7f 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -101,9 +101,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
int length;
size_t linelen;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
memset(line, 0, LINE_SIZE);
len = min_t(size_t, len, LINE_SIZE - 1);
@@ -226,8 +223,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -236,24 +231,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -279,8 +268,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -289,8 +276,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -298,16 +283,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -373,28 +354,6 @@ static int mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
@@ -426,6 +385,29 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -436,7 +418,7 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 507039c20128..6a80f36b5d59 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -52,7 +52,7 @@
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index cb2e49810d68..4eec8889b0ff 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,6 +7,10 @@
#include "cpu.h"
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
+
/*
* Get CPU information for use by the procfs.
*/
@@ -102,6 +106,17 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
seq_puts(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 5c900f9527ff..c4be62058dd9 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup);
#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
- unsigned long tmp;
+ unsigned int changed = 0;
+ unsigned long tmp, prev;
int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
@@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
return;
}
}
+
+ /*
+ * Stupid sanity-check whether RDRAND does *actually* generate
+ * some at least random-looking data.
+ */
+ prev = tmp;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (rdrand_long(&tmp)) {
+ if (prev != tmp)
+ changed++;
+
+ prev = tmp;
+ }
+ }
+
+ if (WARN_ON_ONCE(!changed))
+ pr_emerg(
+"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
+
}
#endif
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 03eb90d00af0..89049b343c7a 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -618,7 +618,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
if (static_branch_unlikely(&rdt_mon_enable_key))
rmdir_mondata_subdir_allrdtgrp(r, d->id);
list_del(&d->list);
- if (is_mbm_enabled())
+ if (r->mon_capable && is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
/*
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index efbd54cc4e69..055c8613b531 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out;
+ }
md.priv = of->kn->priv;
resid = md.u.rid;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index e49b77283924..181c992f448c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -57,6 +57,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
}
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
/**
* struct mon_evt - Entry in the event list of a resource
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 397206f23d14..773124b0e18a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -514,7 +514,7 @@ void mbm_handle_overflow(struct work_struct *work)
mutex_lock(&rdtgroup_mutex);
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
@@ -543,7 +543,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
return;
cpu = cpumask_any(&dom->cpu_mask);
dom->mbm_work_cpu = cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index a46dee8e78db..064e9ef44cd6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
}
rdtgrp = rdtgroup_kn_lock_live(of->kn);
- rdt_last_cmd_clear();
if (!rdtgrp) {
ret = -ENOENT;
- rdt_last_cmd_puts("Directory was removed\n");
goto unlock;
}
@@ -534,11 +532,15 @@ static void move_myself(struct callback_head *head)
kfree(rdtgrp);
}
+ if (unlikely(current->flags & PF_EXITING))
+ goto out;
+
preempt_disable();
/* update PQR_ASSOC MSR to make resource group go into effect */
resctrl_sched_in();
preempt_enable();
+out:
kfree(callback);
}
@@ -727,6 +729,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+#ifdef CONFIG_PROC_CPU_RESCTRL
+
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1) res:
+ * mon:
+ *
+ * resctrl is not available.
+ *
+ * 2) res:/
+ * mon:
+ *
+ * Task is part of the root resctrl control group, and it is not associated
+ * to any monitor group.
+ *
+ * 3) res:/
+ * mon:mon0
+ *
+ * Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4) res:group0
+ * mon:
+ *
+ * Task is part of resctrl control group group0, and it is not associated
+ * to any monitor group.
+ *
+ * 5) res:group0
+ * mon:mon1
+ *
+ * Task is part of resctrl control group group0 and monitor group mon1.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ struct rdtgroup *rdtg;
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Return empty if resctrl has not been mounted. */
+ if (!static_branch_unlikely(&rdt_enable_key)) {
+ seq_puts(s, "res:\nmon:\n");
+ goto unlock;
+ }
+
+ list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+ struct rdtgroup *crg;
+
+ /*
+ * Task information is only relevant for shareable
+ * and exclusive groups.
+ */
+ if (rdtg->mode != RDT_MODE_SHAREABLE &&
+ rdtg->mode != RDT_MODE_EXCLUSIVE)
+ continue;
+
+ if (rdtg->closid != tsk->closid)
+ continue;
+
+ seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+ rdtg->kn->name);
+ seq_puts(s, "mon:");
+ list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+ mon.crdtgrp_list) {
+ if (tsk->rmid != crg->mon.rmid)
+ continue;
+ seq_printf(s, "%s", crg->kn->name);
+ break;
+ }
+ seq_putc(s, '\n');
+ goto unlock;
+ }
+ /*
+ * The above search should succeed. Otherwise return
+ * with an error.
+ */
+ ret = -ENOENT;
+unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+#endif
+
static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -1743,9 +1831,6 @@ static int set_cache_qos_cfg(int level, bool enable)
struct rdt_domain *d;
int cpu;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
@@ -1753,6 +1838,9 @@ static int set_cache_qos_cfg(int level, bool enable)
else
return -EINVAL;
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
/* Pick one CPU from each domain instance to update MSR */
@@ -1972,7 +2060,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
- NULL, "mon_groups",
+ &rdtgroup_default, "mon_groups",
&kn_mongrp);
if (ret < 0)
goto out_info;
@@ -2039,25 +2127,20 @@ enum rdt_param {
nr__rdt_params
};
-static const struct fs_parameter_spec rdt_param_specs[] = {
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
{}
};
-static const struct fs_parameter_description rdt_fs_parameters = {
- .name = "rdt",
- .specs = rdt_param_specs,
-};
-
static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ opt = fs_parse(fc, rdt_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -2207,7 +2290,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
free_rmid(sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
- kfree(sentry);
+
+ if (atomic_read(&sentry->waitcount) != 0)
+ sentry->flags = RDT_DELETED;
+ else
+ kfree(sentry);
}
}
@@ -2245,7 +2332,11 @@ static void rmdir_all_sub(void)
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
- kfree(rdtgrp);
+
+ if (atomic_read(&rdtgrp->waitcount) != 0)
+ rdtgrp->flags = RDT_DELETED;
+ else
+ kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2282,7 +2373,7 @@ static void rdt_kill_sb(struct super_block *sb)
static struct file_system_type rdt_fs_type = {
.name = "resctrl",
.init_fs_context = rdt_init_fs_context,
- .parameters = &rdt_fs_parameters,
+ .parameters = rdt_fs_parameters,
.kill_sb = rdt_kill_sb,
};
@@ -2448,7 +2539,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
/*
* Create the mon_data directory first.
*/
- ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
if (ret)
return ret;
@@ -2638,7 +2729,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
{
@@ -2647,11 +2737,9 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
uint files = 0;
int ret;
- prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
- rdt_last_cmd_clear();
+ prdtgrp = rdtgroup_kn_lock_live(parent_kn);
if (!prdtgrp) {
ret = -ENODEV;
- rdt_last_cmd_puts("Directory was removed\n");
goto out_unlock;
}
@@ -2722,7 +2810,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
kernfs_activate(kn);
/*
- * The caller unlocks the prgrp_kn upon success.
+ * The caller unlocks the parent_kn upon success.
*/
return 0;
@@ -2733,7 +2821,7 @@ out_destroy:
out_free_rgrp:
kfree(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2750,15 +2838,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
* to monitor a subset of tasks and cpus in its parent ctrl_mon group.
*/
static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
- const char *name,
- umode_t mode)
+ const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp, *prgrp;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2771,7 +2856,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
*/
list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2780,7 +2865,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
* to allocate and monitor resources.
*/
static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp;
@@ -2788,8 +2872,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
u32 closid;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2814,7 +2897,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
*/
- ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
@@ -2830,7 +2913,7 @@ out_id_free:
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2863,14 +2946,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* subdirectory
*/
if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
if (rdt_mon_capable && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+ return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
}
@@ -2956,13 +3039,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
+ rdtgroup_ctrl_remove(kn, rdtgrp);
+
/*
* Free all the child monitor group rmids.
*/
free_all_child_rdtgrp(rdtgrp);
- rdtgroup_ctrl_remove(kn, rdtgrp);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index adf9b71386ef..62b137c3c97a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -4,7 +4,7 @@
*/
#include <linux/cpu.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/apic.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index ee48c3fc8a65..d3a0791bc052 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -7,7 +7,7 @@
#include <linux/cpu.h>
#include <asm/apic.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/processor.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..e2ad30e474f8
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 32b4dc9030aa..c222f283b456 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -17,6 +17,12 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+u32 get_umwait_control_msr(void)
+{
+ return umwait_control_cached;
+}
+EXPORT_SYMBOL_GPL(get_umwait_control_msr);
+
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 3c648476d4fb..46d732696c1c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -30,34 +30,69 @@
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
+#include <asm/vmware.h>
#undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt
-#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
+#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
+#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
+
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-#define VMWARE_HYPERVISOR_PORT 0x5658
-#define VMWARE_PORT_CMD_GETVERSION 10
-#define VMWARE_PORT_CMD_GETHZ 45
-#define VMWARE_PORT_CMD_GETVCPU_INFO 68
-#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
-#define VMWARE_PORT_CMD_VCPU_RESERVED 31
+#define VMWARE_CMD_GETVERSION 10
+#define VMWARE_CMD_GETHZ 45
+#define VMWARE_CMD_GETVCPU_INFO 68
+#define VMWARE_CMD_LEGACY_X2APIC 3
+#define VMWARE_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "0"(VMWARE_HYPERVISOR_MAGIC), \
- "1"(VMWARE_PORT_CMD_##cmd), \
- "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
- "memory");
+ __asm__("inl (%%dx), %%eax" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
+ __asm__("vmcall" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(0), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
+ __asm__("vmmcall" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(0), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
+ switch (vmware_hypercall_mode) { \
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
+ VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
+ break; \
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
+ VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
+ break; \
+ default: \
+ VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
+ break; \
+ } \
+ } while (0)
static unsigned long vmware_tsc_khz __ro_after_init;
+static u8 vmware_hypercall_mode __ro_after_init;
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
+ VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
}
@@ -129,6 +164,10 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMCALL);
+ else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
}
static void __init vmware_platform_setup(void)
@@ -136,7 +175,7 @@ static void __init vmware_platform_setup(void)
uint32_t eax, ebx, ecx, edx;
uint64_t lpj, tsc_khz;
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+ VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX) {
lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
@@ -174,10 +213,21 @@ static void __init vmware_platform_setup(void)
vmware_set_capabilities();
}
+static u8 vmware_select_hypercall(void)
+{
+ int eax, ebx, ecx, edx;
+
+ cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
+ return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
+ CPUID_VMWARE_FEATURES_ECX_VMCALL));
+}
+
/*
* While checking the dmi string information, just checking the product
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
+ * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
+ * intentionally defaults to 0.
*/
static uint32_t __init vmware_platform(void)
{
@@ -187,8 +237,16 @@ static uint32_t __init vmware_platform(void)
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
- if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
+ if (eax >= CPUID_VMWARE_FEATURES_LEAF)
+ vmware_hypercall_mode =
+ vmware_select_hypercall();
+
+ pr_info("hypercall mode: 0x%02x\n",
+ (unsigned int) vmware_hypercall_mode);
+
return CPUID_VMWARE_INFO_LEAF;
+ }
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
return 1;
@@ -200,9 +258,9 @@ static uint32_t __init vmware_platform(void)
static bool __init vmware_legacy_x2apic_available(void)
{
uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
- return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
- (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+ VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
+ return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 &&
+ (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
}
const __initconst struct hypervisor_x86 x86_hyper_vmware = {
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 8e6f2f4b4afe..df1358ba622b 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -16,13 +16,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -58,8 +51,6 @@ static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
@@ -89,31 +80,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
}
-static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
@@ -141,8 +107,7 @@ static void init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- zhaoxin_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2bf70a2fed90..fd87b59452a3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -39,6 +40,7 @@
#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
+#include <asm/cmdline.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
+/*
+ * When the crashkernel option is specified, only use the low
+ * 1M for the real mode trampoline.
+ */
+void __init crash_reserve_low_1M(void)
+{
+ if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
+ return;
+
+ memblock_reserve(0, 1<<20);
+ pr_info("Reserving the low 1M of memory for crashkernel\n");
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_KEXEC_FILE
-static unsigned long crash_zero_bytes;
-
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
unsigned int nr_ranges = 0;
struct crash_mem *cmem;
- walk_system_ram_res(0, -1, &nr_ranges,
- get_nr_ram_ranges_callback);
+ walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
if (!nr_ranges)
return NULL;
@@ -217,17 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
{
int ret = 0;
+ /* Exclude the low 1M because it is always reserved */
+ ret = crash_exclude_mem_range(cmem, 0, 1<<20);
+ if (ret)
+ return ret;
+
/* Exclude crashkernel region */
ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
if (ret)
return ret;
- if (crashk_low_res.end) {
+ if (crashk_low_res.end)
ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
- crashk_low_res.end);
- if (ret)
- return ret;
- }
+ crashk_low_res.end);
return ret;
}
@@ -248,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
unsigned long *sz)
{
struct crash_mem *cmem;
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- int ret, i;
+ int ret;
cmem = fill_up_crash_elf_data();
if (!cmem)
return -ENOMEM;
- ret = walk_system_ram_res(0, -1, cmem,
- prepare_elf64_ram_headers_callback);
+ ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
if (ret)
goto out;
@@ -267,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
goto out;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem,
- IS_ENABLED(CONFIG_X86_64), addr, sz);
- if (ret)
- goto out;
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
- /*
- * If a range matches backup region, adjust offset to backup
- * segment.
- */
- ehdr = (Elf64_Ehdr *)*addr;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- for (i = 0; i < ehdr->e_phnum; phdr++, i++)
- if (phdr->p_type == PT_LOAD &&
- phdr->p_paddr == image->arch.backup_src_start &&
- phdr->p_memsz == image->arch.backup_src_sz) {
- phdr->p_offset = image->arch.backup_load_addr;
- break;
- }
out:
vfree(cmem);
return ret;
@@ -298,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
return 1;
- memcpy(&params->e820_table[nr_e820_entries], entry,
- sizeof(struct e820_entry));
+ memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
params->e820_entries++;
return 0;
}
@@ -323,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
- int ret = 0;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
cmem->nr_ranges = 1;
- /* Exclude Backup region */
- start = image->arch.backup_load_addr;
- end = start + image->arch.backup_src_sz - 1;
- ret = crash_exclude_mem_range(cmem, start, end);
- if (ret)
- return ret;
-
/* Exclude elf header region */
start = image->arch.elf_load_addr;
end = start + image->arch.elf_headers_sz - 1;
@@ -358,40 +344,39 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
- /* Add first 640K segment */
- ei.addr = image->arch.backup_src_start;
- ei.size = image->arch.backup_src_sz;
- ei.type = E820_TYPE_RAM;
- add_e820_entry(params, &ei);
+ /* Add the low 1M */
+ cmd.type = E820_TYPE_RAM;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
+ memmap_entry_callback);
/* Add ACPI tables */
cmd.type = E820_TYPE_ACPI;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add ACPI Non-volatile Storage */
cmd.type = E820_TYPE_NVS;
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add e820 reserved ranges */
cmd.type = E820_TYPE_RESERVED;
flags = IORESOURCE_MEM;
walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
- ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.size = resource_size(&crashk_low_res);
ei.type = E820_TYPE_RAM;
add_e820_entry(params, &ei);
}
/* Exclude some ranges from crashk_res and add rest to memmap */
- ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
- crashk_res.end);
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
if (ret)
goto out;
@@ -411,55 +396,12 @@ out:
return ret;
}
-static int determine_backup_region(struct resource *res, void *arg)
-{
- struct kimage *image = arg;
-
- image->arch.backup_src_start = res->start;
- image->arch.backup_src_sz = resource_size(res);
-
- /* Expecting only one range for backup region */
- return 1;
-}
-
int crash_load_segments(struct kimage *image)
{
int ret;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
.buf_max = ULONG_MAX, .top_down = false };
- /*
- * Determine and load a segment for backup area. First 640K RAM
- * region is backup source
- */
-
- ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
- image, determine_backup_region);
-
- /* Zero or postive return values are ok */
- if (ret < 0)
- return ret;
-
- /* Add backup segment. */
- if (image->arch.backup_src_sz) {
- kbuf.buffer = &crash_zero_bytes;
- kbuf.bufsz = sizeof(crash_zero_bytes);
- kbuf.memsz = image->arch.backup_src_sz;
- kbuf.buf_align = PAGE_SIZE;
- /*
- * Ideally there is no source for backup segment. This is
- * copied in purgatory after crash. Just add a zero filled
- * segment for now to make sure checksum logic works fine.
- */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- return ret;
- image->arch.backup_load_addr = kbuf.mem;
- pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
- image->arch.backup_load_addr,
- image->arch.backup_src_start, kbuf.memsz);
- }
-
/* Prepare elf headers and add a segment */
ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
if (ret)
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c
new file mode 100644
index 000000000000..c0159a7bca6d
--- /dev/null
+++ b/arch/x86/kernel/crash_core_32.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crash_core.h>
+
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE);
+#endif
+}
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c
new file mode 100644
index 000000000000..845a57eb4eb7
--- /dev/null
+++ b/arch/x86/kernel/crash_core_64.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crash_core.h>
+
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+ u64 sme_mask = sme_me_mask;
+
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_NUMBER(sme_mask);
+}
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 22369dd5de3b..045e82e8945b 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}
+
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+ return read_from_oldmem(buf, count, ppos, 0, sev_active());
+}
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
deleted file mode 100644
index 0b8cedb20d6d..000000000000
--- a/arch/x86/kernel/doublefault.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-
-#ifdef CONFIG_X86_32
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-
-static void doublefault_fn(void)
-{
- struct desc_ptr gdt_desc = {0, 0};
- unsigned long gdt, tss;
-
- native_store_gdt(&gdt_desc);
- gdt = gdt_desc.address;
-
- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
-
- if (ptr_ok(gdt)) {
- gdt += GDT_ENTRY_TSS << 3;
- tss = get_desc_base((struct desc_struct *)gdt);
- printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
-
- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
-
- printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
- t->ip, t->sp);
-
- printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
- t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
- }
- }
-
- for (;;)
- cpu_relax();
-}
-
-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
-};
-
-/* dummy for do_double_fault() call */
-void df_debug(struct pt_regs *regs, long error_code) {}
-
-#else /* !CONFIG_X86_32 */
-
-void df_debug(struct pt_regs *regs, long error_code)
-{
- pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
- show_regs(regs);
- panic("Machine halted.");
-}
-#endif
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..3793646f0fb5
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+extern void double_fault(void);
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
+
+static void set_df_gdt_entry(unsigned int cpu);
+
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+ unsigned long cr2;
+ struct pt_regs regs;
+
+ BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+
+ cr2 = native_read_cr2();
+
+ /* Reset back to the normal kernel task. */
+ force_reload_TR();
+ set_df_gdt_entry(smp_processor_id());
+
+ trace_hardirqs_off();
+
+ /*
+ * Fill in pt_regs. A downside of doing this in C is that the unwinder
+ * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+ * won't successfully unwind to the source of the double fault.
+ * The main dump from do_double_fault() is fine, though, since it
+ * uses these regs directly.
+ *
+ * If anyone ever cares, this could be moved to asm.
+ */
+ regs.ss = TSS(ss);
+ regs.__ssh = 0;
+ regs.sp = TSS(sp);
+ regs.flags = TSS(flags);
+ regs.cs = TSS(cs);
+ /* We won't go through the entry asm, so we can leave __csh as 0. */
+ regs.__csh = 0;
+ regs.ip = TSS(ip);
+ regs.orig_ax = 0;
+ regs.gs = TSS(gs);
+ regs.__gsh = 0;
+ regs.fs = TSS(fs);
+ regs.__fsh = 0;
+ regs.es = TSS(es);
+ regs.__esh = 0;
+ regs.ds = TSS(ds);
+ regs.__dsh = 0;
+ regs.ax = TSS(ax);
+ regs.bp = TSS(bp);
+ regs.di = TSS(di);
+ regs.si = TSS(si);
+ regs.dx = TSS(dx);
+ regs.cx = TSS(cx);
+ regs.bx = TSS(bx);
+
+ do_double_fault(&regs, 0, cr2);
+
+ /*
+ * x86_32 does not save the original CR3 anywhere on a task switch.
+ * This means that, even if we wanted to return, we would need to find
+ * some way to reconstruct CR3. We could make a credible guess based
+ * on cpu_tlbstate, but that would be racy and would not account for
+ * PTI.
+ *
+ * Instead, don't bother. We can return through
+ * rewind_stack_do_exit() instead.
+ */
+ panic("cannot return from double fault\n");
+}
+NOKPROBE_SYMBOL(doublefault_shim);
+
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+ .tss = {
+ /*
+ * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+ * active. sp is filled in later.
+ */
+ .ldt = 0,
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
+
+ .ip = (unsigned long) double_fault,
+ .flags = X86_EFLAGS_FIXED,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+#ifndef CONFIG_X86_32_LAZY_GS
+ .gs = __KERNEL_STACK_CANARY,
+#endif
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
+ },
+};
+
+static void set_df_gdt_entry(unsigned int cpu)
+{
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+ &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
+void doublefault_init_cpu_tss(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+ /*
+ * The linker isn't smart enough to initialize percpu variables that
+ * point to other places in percpu space.
+ */
+ this_cpu_write(doublefault_stack.tss.sp,
+ (unsigned long)&cea->doublefault_stack.stack +
+ sizeof(doublefault_stack.stack));
+
+ set_df_gdt_entry(cpu);
+}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2b5886401e5f..ae64ec7f752f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -365,21 +365,30 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
}
NOKPROBE_SYMBOL(oops_end);
-int __die(const char *str, struct pt_regs *regs, long err)
+static void __die_header(const char *str, struct pt_regs *regs, long err)
{
+ const char *pr = "";
+
/* Save the regs of the first oops for the executive summary later. */
if (!die_counter)
exec_summary_regs = *regs;
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+}
+NOKPROBE_SYMBOL(__die_header);
+static int __die_body(const char *str, struct pt_regs *regs, long err)
+{
show_regs(regs);
print_modules();
@@ -389,6 +398,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
return 0;
}
+NOKPROBE_SYMBOL(__die_body);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ __die_header(str, regs, err);
+ return __die_body(str, regs, err);
+}
NOKPROBE_SYMBOL(__die);
/*
@@ -405,6 +421,19 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ __die_header(str, regs, err);
+ if (gp_addr)
+ kasan_non_canonical_hook(gp_addr);
+ if (__die_body(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
void show_regs(struct pt_regs *regs)
{
show_regs_print_info(KERN_DEFAULT);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 64a59d726639..8e3a8fedfa4d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE";
+ if (type == STACK_TYPE_EXCEPTION)
+ return "#DF";
+
return NULL;
}
@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
return true;
}
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
+{
+#ifdef CONFIG_DOUBLEFAULT
+ struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+ struct doublefault_stack *ss = &cea->doublefault_stack;
+
+ void *begin = ss->stack;
+ void *end = begin + sizeof(ss->stack);
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_EXCEPTION;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask)
{
@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_softirq_stack(stack, info))
goto recursion_check;
+ if (in_doublefault_stack(stack, info))
+ goto recursion_check;
+
goto unknown;
recursion_check:
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 753b8cfe8b8a..87b97897a881 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ /*
+ * Handle the case where stack trace is collected _before_
+ * cea_exception_stacks had been initialized.
+ */
+ if (!begin)
+ return false;
+
end = begin + sizeof(struct cea_exception_stacks);
/* Bail if @stack is outside the exception stack area. */
if (stk < begin || stk >= end)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7da2bcd2b8eb..c5399e80c59c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type)
case E820_TYPE_RAM: /* Fall through: */
case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
case E820_TYPE_RESERVED: pr_cont("reserved"); break;
+ case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
@@ -999,6 +1000,17 @@ void __init e820__reserve_setup_data(void)
data = early_memremap(pa_data, sizeof(*data));
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ e820__range_update(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ e820__range_update_kexec(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ }
+
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
@@ -1037,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
case E820_TYPE_PRAM: return "Persistent Memory (legacy)";
case E820_TYPE_PMEM: return "Persistent Memory";
case E820_TYPE_RESERVED: return "Reserved";
+ case E820_TYPE_SOFT_RESERVED: return "Soft Reserved";
default: return "Unknown E820 type";
}
}
@@ -1052,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
case E820_TYPE_PRAM: /* Fall-through: */
case E820_TYPE_PMEM: /* Fall-through: */
case E820_TYPE_RESERVED: /* Fall-through: */
+ case E820_TYPE_SOFT_RESERVED: /* Fall-through: */
default: return IORESOURCE_MEM;
}
}
@@ -1064,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
+ case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
@@ -1078,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
return true;
/*
- * Treat persistent memory like device memory, i.e. reserve it
- * for exclusive use of a driver
+ * Treat persistent memory and other special memory ranges like
+ * device memory, i.e. reserve it for exclusive use of a driver
*/
switch (type) {
case E820_TYPE_RESERVED:
+ case E820_TYPE_SOFT_RESERVED:
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
return false;
@@ -1285,6 +1301,9 @@ void __init e820__memblock_setup(void)
if (end != (resource_size_t)end)
continue;
+ if (entry->type == E820_TYPE_SOFT_RESERVED)
+ memblock_reserve(entry->addr, entry->size);
+
if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
continue;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6c4f01540833..2f9ec14be3b1 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -549,6 +549,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_CNL_IDS(&gen9_early_ops),
INTEL_ICL_11_IDS(&gen11_early_ops),
INTEL_EHL_IDS(&gen11_early_ops),
+ INTEL_TGL_12_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
@@ -709,6 +710,12 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_INTEL, 0x3e20,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_INTEL, 0x3ec4,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_INTEL, 0x8a12,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 0071b794ed19..400a05e1c1c5 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -352,6 +352,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_unlock();
return 0;
}
+ fpregs_deactivate(fpu);
fpregs_unlock();
}
@@ -403,6 +404,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
if (!ret)
fpregs_mark_activate();
+ else
+ fpregs_deactivate(fpu);
fpregs_unlock();
err_out:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e5cb67d67c03..a1806598aaa4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -107,23 +107,20 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
-static int xfeature_is_supervisor(int xfeature_nr)
+static bool xfeature_is_supervisor(int xfeature_nr)
{
/*
- * We currently do not support supervisor states, but if
- * we did, we could find out like this.
- *
- * SDM says: If state component 'i' is a user state component,
- * ECX[0] return 0; if state component i is a supervisor
- * state component, ECX[0] returns 1.
+ * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1)
+ * returns ECX[0] set to (1) for a supervisor state, and cleared (0)
+ * for a user state.
*/
u32 eax, ebx, ecx, edx;
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
- return !!(ecx & 1);
+ return ecx & 1;
}
-static int xfeature_is_user(int xfeature_nr)
+static bool xfeature_is_user(int xfeature_nr)
{
return !xfeature_is_supervisor(xfeature_nr);
}
@@ -254,10 +251,13 @@ static void __init setup_xstate_features(void)
* in the fixed offsets in the xsave area in either compacted form
* or standard form.
*/
- xstate_offsets[0] = 0;
- xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
- xstate_offsets[1] = xstate_sizes[0];
- xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
+ xstate_offsets[XFEATURE_FP] = 0;
+ xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
+ xmm_space);
+
+ xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
+ xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
+ xmm_space);
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
if (!xfeature_enabled(i))
@@ -342,7 +342,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+ unsigned int xstate_comp_sizes[XFEATURE_MAX];
int i;
/*
@@ -350,8 +350,9 @@ static void __init setup_xstate_comp(void)
* in the fixed offsets in the xsave area in either compacted form
* or standard form.
*/
- xstate_comp_offsets[0] = 0;
- xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+ xstate_comp_offsets[XFEATURE_FP] = 0;
+ xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
+ xmm_space);
if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
@@ -415,7 +416,8 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+ init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
+ xfeatures_mask;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -840,7 +842,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
/*
* We should not ever be requesting features that we
- * have not enabled. Remember that pcntxt_mask is
+ * have not enabled. Remember that xfeatures_mask is
* what we write to the XCR0 register.
*/
WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 024c3053dbba..37a0aeaf89e7 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/module.h>
#include <linux/memory.h>
+#include <linux/vmalloc.h>
#include <trace/syscall.h>
@@ -34,6 +35,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int ftrace_poke_late = 0;
+
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
@@ -43,84 +46,37 @@ int ftrace_arch_code_modify_prepare(void)
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
- set_kernel_text_rw();
- set_all_modules_text_rw();
+ ftrace_poke_late = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
- set_all_modules_text_ro();
- set_kernel_text_ro();
+ /*
+ * ftrace_make_{call,nop}() may be called during
+ * module load, and we need to finish the text_poke_queue()
+ * that they do, here.
+ */
+ text_poke_finish();
+ ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
return 0;
}
-union ftrace_code_union {
- char code[MCOUNT_INSN_SIZE];
- struct {
- unsigned char op;
- int offset;
- } __attribute__((packed));
-};
-
-static int ftrace_calc_offset(long ip, long addr)
-{
- return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_nop_replace(void)
{
- static union ftrace_code_union calc;
-
- calc.op = op;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
-
- return calc.code;
-}
-
-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
- return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
+ return ideal_nops[NOP_ATOMIC5];
}
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only, so we use
- * the kernel identity mapping instead of the kernel text mapping
- * to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return ip;
+ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
-static const unsigned char *ftrace_nop_replace(void)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
{
- return ideal_nops[NOP_ATOMIC5];
-}
-
-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- ftrace_expected = old_code;
+ char cur_code[MCOUNT_INSN_SIZE];
/*
* Note:
@@ -129,31 +85,46 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
* Carefully read and modify the code with probe_kernel_*(), and make
* sure what we read is what we expected it to be before modifying it.
*/
-
/* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
return -EFAULT;
+ }
/* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ON(1);
return -EINVAL;
+ }
- ip = text_ip_addr(ip);
-
- /* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
- return -EPERM;
+ return 0;
+}
- sync_core();
+/*
+ * Marked __ref because it calls text_poke_early() which is .init.text. That is
+ * ok because that call will happen early, during boot, when .init sections are
+ * still present.
+ */
+static int __ref
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
+{
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
+ /* replace the text with the new text */
+ if (ftrace_poke_late)
+ text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
@@ -167,19 +138,20 @@ int ftrace_make_nop(struct module *mod,
* just modify the code directly.
*/
if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
+ return ftrace_modify_code_direct(ip, old, new);
- ftrace_expected = NULL;
-
- /* Normal cases use add_brk_on_nop */
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
@@ -189,43 +161,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
}
/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -237,452 +172,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
- ftrace_expected = NULL;
return -EINVAL;
}
-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
- unsigned char old[MCOUNT_INSN_SIZE];
- int ret;
-
- memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
- ftrace_update_func = ip;
- /* Make sure the breakpoints see the ftrace_update_func update */
- smp_wmb();
-
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
- ret = ftrace_modify_code(ip, old, new);
-
- atomic_dec(&modifying_ftrace_code);
-
- return ret;
-}
-
int ftrace_update_ftrace_func(ftrace_func_t func)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char *new;
- int ret;
-
- ftrace_update_func_call = (unsigned long)func;
-
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
-
- /* Also update the regs callback function */
- if (!ret) {
- ip = (unsigned long)(&ftrace_regs_call);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- }
-
- return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
- if (ip == ftrace_update_func)
- return 1;
-
- return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
unsigned long ip;
+ const char *new;
- if (WARN_ON_ONCE(!regs))
- return 0;
-
- ip = regs->ip - INT3_INSN_SIZE;
-
- if (ftrace_location(ip)) {
- int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
- return 1;
- } else if (is_ftrace_caller(ip)) {
- if (!ftrace_update_func_call) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
- int3_emulate_call(regs, ftrace_update_func_call);
- return 1;
- }
-
- return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- ip = text_ip_addr(ip);
-
- if (probe_kernel_write((void *)ip, val, size))
- return -EPERM;
-
- return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- ftrace_expected = old;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ftrace_addr = ftrace_get_addr_curr(rec);
-
- ret = ftrace_test_record(rec, enable);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return 0;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = ftrace_get_addr_new(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = ftrace_get_addr_curr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- ftrace_expected = nop;
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
- /* skip breakpoint */
- ip++;
- new++;
- return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
- return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
- }
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
-static void do_sync_core(void *data)
-{
- sync_core();
-}
-
-static void run_sync(void)
-{
- int enable_irqs;
-
- /* No need to sync if there's only one CPU */
- if (num_online_cpus() == 1)
- return;
-
- enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disabled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
-}
-
void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ const char *new, *old;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- report = "updating code";
- count = 0;
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return;
+ }
}
- run_sync();
-
- report = "removing breakpoints";
- count = 0;
-
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- run_sync();
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
- return;
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- remove_breakpoints:
- pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- /*
- * Breakpoints are handled only when this function is in
- * progress. The system could not work with them.
- */
- if (remove_breakpoint(rec))
- BUG();
+ text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
}
- run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- /*
- * The breakpoint is handled only when this function is in progress.
- * The system could not work if we could not remove it.
- */
- BUG_ON(ret);
- out:
- run_sync();
- return ret;
-
- fail_update:
- /* Also here the system could not work with the breakpoint */
- if (ftrace_write(ip, old_code, 1))
- BUG();
- goto out;
+ text_poke_finish();
}
void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void)
@@ -747,6 +314,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
+ unsigned long call_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -763,10 +331,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
}
size = end_offset - start_offset;
@@ -823,16 +393,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE,
+ trampoline + call_offset,
+ ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
set_vm_flush_reset_perms(trampoline);
- /*
- * Module allocation needs to be completed by making the page
- * executable. The page is still writable, which is a security hazard,
- * but anyhow ftrace breaks W^X completely.
- */
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -859,62 +434,54 @@ static unsigned long calc_trampoline_call_offset(bool save_regs)
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
- unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret, npages;
+ const char *new;
- if (ops->trampoline) {
- /*
- * The ftrace_ops caller may set up its own trampoline.
- * In such a case, this code must not modify it.
- */
- if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
- return;
- npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
- set_memory_rw(ops->trampoline, npages);
- } else {
+ if (!ops->trampoline) {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
- npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ return;
}
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
-
func = ftrace_ops_get_func(ops);
- ftrace_update_func_call = (unsigned long)func;
-
+ mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- set_memory_ro(ops->trampoline, npages);
-
- /* The update should never fail */
- WARN_ON(ret);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
/* Return the address of the function the trampoline calls */
static void *addr_from_call(void *ptr)
{
- union ftrace_code_union calc;
+ union text_poke_insn call;
int ret;
- ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE);
+ ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE);
if (WARN_ON_ONCE(ret < 0))
return NULL;
/* Make sure this is a call */
- if (WARN_ON_ONCE(calc.op != 0xe8)) {
- pr_warn("Expected e8, got %x\n", calc.op);
+ if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) {
+ pr_warn("Expected E8, got %x\n", call.opcode);
return NULL;
}
- return ptr + MCOUNT_INSN_SIZE + calc.offset;
+ return ptr + CALL_INSN_SIZE + call.disp;
}
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
@@ -981,19 +548,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- return ftrace_text_replace(0xe9, ip, addr);
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
}
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
- unsigned char *new;
+ const char *new;
- ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
-
- return update_ftrace_func(ip, new);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
@@ -1019,10 +585,9 @@ int ftrace_disable_ftrace_graph_caller(void)
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
int faulted;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
/*
* When resuming from suspend-to-ram, this function can be indirectly
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 073aab525d80..e8a9f8370112 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -12,20 +12,18 @@
#include <asm/frame.h>
#include <asm/asm-offsets.h>
-# define function_hook __fentry__
-EXPORT_SYMBOL(__fentry__)
-
#ifdef CONFIG_FRAME_POINTER
# define MCOUNT_FRAME 1 /* using frame = true */
#else
# define MCOUNT_FRAME 0 /* using frame = false */
#endif
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
ret
-END(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
-ENTRY(ftrace_caller)
+SYM_CODE_START(ftrace_caller)
#ifdef CONFIG_FRAME_POINTER
/*
@@ -85,11 +83,11 @@ ftrace_graph_call:
#endif
/* This is weak to keep gas from relaxing the jumps */
-WEAK(ftrace_stub)
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
ret
-END(ftrace_caller)
+SYM_CODE_END(ftrace_caller)
-ENTRY(ftrace_regs_caller)
+SYM_CODE_START(ftrace_regs_caller)
/*
* We're here from an mcount/fentry CALL, and the stack frame looks like:
*
@@ -138,7 +136,7 @@ ENTRY(ftrace_regs_caller)
movl function_trace_op, %ecx # 3rd argument: ftrace_pos
pushl %esp # 4th argument: pt_regs
-GLOBAL(ftrace_regs_call)
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
call ftrace_stub
addl $4, %esp # skip 4th argument
@@ -163,9 +161,10 @@ GLOBAL(ftrace_regs_call)
popl %eax
jmp .Lftrace_ret
+SYM_CODE_END(ftrace_regs_caller)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
+SYM_CODE_START(ftrace_graph_caller)
pushl %eax
pushl %ecx
pushl %edx
@@ -179,7 +178,7 @@ ENTRY(ftrace_graph_caller)
popl %ecx
popl %eax
ret
-END(ftrace_graph_caller)
+SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 809d54397dba..369e61faacfe 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -14,9 +14,6 @@
.code64
.section .entry.text, "ax"
-# define function_hook __fentry__
-EXPORT_SYMBOL(__fentry__)
-
#ifdef CONFIG_FRAME_POINTER
/* Save parent and function stack frames (rip and rbp) */
# define MCOUNT_FRAME_SIZE (8+16*2)
@@ -88,6 +85,7 @@ EXPORT_SYMBOL(__fentry__)
movq %rdi, RDI(%rsp)
movq %r8, R8(%rsp)
movq %r9, R9(%rsp)
+ movq $0, ORIG_RAX(%rsp)
/*
* Save the original RBP. Even though the mcount ABI does not
* require this, it helps out callers.
@@ -114,7 +112,11 @@ EXPORT_SYMBOL(__fentry__)
subq $MCOUNT_INSN_SIZE, %rdi
.endm
-.macro restore_mcount_regs
+.macro restore_mcount_regs save=0
+
+ /* ftrace_regs_caller or frame pointers require this */
+ movq RBP(%rsp), %rbp
+
movq R9(%rsp), %r9
movq R8(%rsp), %r8
movq RDI(%rsp), %rdi
@@ -123,31 +125,29 @@ EXPORT_SYMBOL(__fentry__)
movq RCX(%rsp), %rcx
movq RAX(%rsp), %rax
- /* ftrace_regs_caller can modify %rbp */
- movq RBP(%rsp), %rbp
-
- addq $MCOUNT_REG_SIZE, %rsp
+ addq $MCOUNT_REG_SIZE-\save, %rsp
.endm
#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
retq
-ENDPROC(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
-ENTRY(ftrace_caller)
+SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
-GLOBAL(ftrace_caller_op_ptr)
+SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
-GLOBAL(ftrace_call)
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
call ftrace_stub
restore_mcount_regs
@@ -157,10 +157,10 @@ GLOBAL(ftrace_call)
* think twice before adding any new code or changing the
* layout here.
*/
-GLOBAL(ftrace_epilogue)
+SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
+SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
jmp ftrace_stub
#endif
@@ -168,19 +168,21 @@ GLOBAL(ftrace_graph_call)
* This is weak to keep gas from relaxing the jumps.
* It is also used to copy the retq for trampolines.
*/
-WEAK(ftrace_stub)
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
retq
-ENDPROC(ftrace_caller)
+SYM_FUNC_END(ftrace_caller)
-ENTRY(ftrace_regs_caller)
+SYM_FUNC_START(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
pushfq
+ UNWIND_HINT_SAVE
+
/* added 8 bytes to save flags */
save_mcount_regs 8
/* save_mcount_regs fills in first two parameters */
-GLOBAL(ftrace_regs_caller_op_ptr)
+SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
@@ -209,7 +211,7 @@ GLOBAL(ftrace_regs_caller_op_ptr)
/* regs go into 4th parameter */
leaq (%rsp), %rcx
-GLOBAL(ftrace_regs_call)
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
call ftrace_stub
/* Copy flags back to SS, to restore them */
@@ -228,7 +230,33 @@ GLOBAL(ftrace_regs_call)
movq R10(%rsp), %r10
movq RBX(%rsp), %rbx
- restore_mcount_regs
+ movq ORIG_RAX(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+ /* If ORIG_RAX is anything but zero, make this a call to that */
+ movq ORIG_RAX(%rsp), %rax
+ cmpq $0, %rax
+ je 1f
+
+ /* Swap the flags with orig_rax */
+ movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs 8
+
+ jmp 2f
+
+1: restore_mcount_regs
+
+
+2:
+ /*
+ * The stack layout is nondetermistic here, depending on which path was
+ * taken. This confuses objtool and ORC, rightfully so. For now,
+ * pretend the stack always looks like the non-direct case.
+ */
+ UNWIND_HINT_RESTORE
/* Restore flags */
popfq
@@ -239,16 +267,16 @@ GLOBAL(ftrace_regs_call)
* The trampoline will add the code to jump
* to the return.
*/
-GLOBAL(ftrace_regs_caller_end)
+SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
-ENDPROC(ftrace_regs_caller)
+SYM_FUNC_END(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
@@ -261,7 +289,7 @@ fgraph_trace:
jnz ftrace_graph_caller
#endif
-GLOBAL(ftrace_stub)
+SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
retq
trace:
@@ -279,11 +307,12 @@ trace:
restore_mcount_regs
jmp fgraph_trace
-ENDPROC(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
+SYM_FUNC_START(ftrace_graph_caller)
/* Saves rbp into %rdx and fills first parameter */
save_mcount_regs
@@ -294,9 +323,9 @@ ENTRY(ftrace_graph_caller)
restore_mcount_regs
retq
-ENDPROC(ftrace_graph_caller)
+SYM_FUNC_END(ftrace_graph_caller)
-ENTRY(return_to_handler)
+SYM_CODE_START(return_to_handler)
UNWIND_HINT_EMPTY
subq $24, %rsp
@@ -312,5 +341,5 @@ ENTRY(return_to_handler)
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC %rdi
-END(return_to_handler)
+SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 29ffa495bd1c..206a4b6144c2 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr,
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
*/
pmd = fixup_pointer(level2_kernel_pgt, physaddr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index((unsigned long)_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
- }
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
/*
* Fixup phys_base - remove the memory encryption mask to obtain
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30f9cb2c0b55..3923ab4630d7 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -64,7 +64,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* can.
*/
__HEAD
-ENTRY(startup_32)
+SYM_CODE_START(startup_32)
movl pa(initial_stack),%ecx
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -156,7 +156,7 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(xen_entry)
+SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK)
/* Unknown implementation; there's really
nothing we can do at this point. */
ud2a
@@ -172,6 +172,7 @@ num_subarch_entries = (. - subarch_entries) / 4
#else
jmp .Ldefault_entry
#endif /* CONFIG_PARAVIRT */
+SYM_CODE_END(startup_32)
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -179,12 +180,12 @@ num_subarch_entries = (. - subarch_entries) / 4
* up already except stack. We just set up stack here. Then call
* start_secondary().
*/
-ENTRY(start_cpu0)
+SYM_FUNC_START(start_cpu0)
movl initial_stack, %ecx
movl %ecx, %esp
call *(initial_code)
1: jmp 1b
-ENDPROC(start_cpu0)
+SYM_FUNC_END(start_cpu0)
#endif
/*
@@ -195,7 +196,7 @@ ENDPROC(start_cpu0)
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-ENTRY(startup_32_smp)
+SYM_FUNC_START(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
@@ -362,7 +363,7 @@ ENTRY(startup_32_smp)
call *(initial_code)
1: jmp 1b
-ENDPROC(startup_32_smp)
+SYM_FUNC_END(startup_32_smp)
#include "verify_cpu.S"
@@ -392,7 +393,7 @@ setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
ret
-ENTRY(early_idt_handler_array)
+SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
# 28(%esp) %eip
@@ -407,9 +408,9 @@ ENTRY(early_idt_handler_array)
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+SYM_FUNC_END(early_idt_handler_array)
-early_idt_handler_common:
+SYM_CODE_START_LOCAL(early_idt_handler_common)
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -460,10 +461,10 @@ early_idt_handler_common:
decl %ss:early_recursion_flag
addl $4, %esp /* pop pt_regs->orig_ax */
iret
-ENDPROC(early_idt_handler_common)
+SYM_CODE_END(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
-ENTRY(early_ignore_irq)
+SYM_FUNC_START(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -498,19 +499,16 @@ ENTRY(early_ignore_irq)
hlt_loop:
hlt
jmp hlt_loop
-ENDPROC(early_ignore_irq)
+SYM_FUNC_END(early_ignore_irq)
__INITDATA
.align 4
-GLOBAL(early_recursion_flag)
- .long 0
+SYM_DATA(early_recursion_flag, .long 0)
__REFDATA
.align 4
-ENTRY(initial_code)
- .long i386_start_kernel
-ENTRY(setup_once_ref)
- .long setup_once
+SYM_DATA(initial_code, .long i386_start_kernel)
+SYM_DATA(setup_once_ref, .long setup_once)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
@@ -553,7 +551,7 @@ EXPORT_SYMBOL(empty_zero_page)
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PGD_ALIGN
-ENTRY(initial_page_table)
+SYM_DATA_START(initial_page_table)
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
@@ -571,17 +569,28 @@ ENTRY(initial_page_table)
# error "Kernel PMDs should be 1, 2 or 3"
# endif
.align PAGE_SIZE /* needs to be page-sized too */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * PTI needs another page so sync_initial_pagetable() works correctly
+ * and does not scribble over the data which is placed behind the
+ * actual initial_page_table. See clone_pgd_range().
+ */
+ .fill 1024, 4, 0
+#endif
+
+SYM_DATA_END(initial_page_table)
#endif
.data
.balign 4
-ENTRY(initial_stack)
- /*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
- * unwinder reliably detect the end of the stack.
- */
- .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
- TOP_OF_KERNEL_STACK_PADDING;
+/*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * reliably detect the end of the stack.
+ */
+SYM_DATA(initial_stack,
+ .long init_thread_union + THREAD_SIZE -
+ SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
__INITRODATA
int_msg:
@@ -597,27 +606,28 @@ int_msg:
*/
.data
-.globl boot_gdt_descr
-
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
+SYM_DATA_START_LOCAL(boot_gdt_descr)
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
+SYM_DATA_END(boot_gdt_descr)
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(early_gdt_descr)
+SYM_DATA_START(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long gdt_page /* Overwritten for secondary CPUs */
+SYM_DATA_END(early_gdt_descr)
/*
* The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*/
.align L1_CACHE_BYTES
-ENTRY(boot_gdt)
+SYM_DATA_START(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+SYM_DATA_END(boot_gdt)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index f3d3e9646a99..4bbc770af632 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -49,8 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
__HEAD
.code64
- .globl startup_64
-startup_64:
+SYM_CODE_START_NOALIGN(startup_64)
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -90,7 +89,9 @@ startup_64:
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
-ENTRY(secondary_startup_64)
+SYM_CODE_END(startup_64)
+
+SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -240,7 +241,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-END(secondary_startup_64)
+SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -250,30 +251,28 @@ END(secondary_startup_64)
* up already except stack. We just set up stack here. Then call
* start_secondary() via .Ljump_to_C_code.
*/
-ENTRY(start_cpu0)
+SYM_CODE_START(start_cpu0)
UNWIND_HINT_EMPTY
movq initial_stack(%rip), %rsp
jmp .Ljump_to_C_code
-END(start_cpu0)
+SYM_CODE_END(start_cpu0)
#endif
/* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
- GLOBAL(initial_code)
- .quad x86_64_start_kernel
- GLOBAL(initial_gs)
- .quad INIT_PER_CPU_VAR(fixed_percpu_data)
- GLOBAL(initial_stack)
- /*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
- * unwinder reliably detect the end of the stack.
- */
- .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
+SYM_DATA(initial_code, .quad x86_64_start_kernel)
+SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
+
+/*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * reliably detect the end of the stack.
+ */
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
__FINITDATA
__INIT
-ENTRY(early_idt_handler_array)
+SYM_CODE_START(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
@@ -289,9 +288,9 @@ ENTRY(early_idt_handler_array)
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
UNWIND_HINT_IRET_REGS offset=16
-END(early_idt_handler_array)
+SYM_CODE_END(early_idt_handler_array)
-early_idt_handler_common:
+SYM_CODE_START_LOCAL(early_idt_handler_common)
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -333,17 +332,11 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
jmp restore_regs_and_return_to_kernel
-END(early_idt_handler_common)
+SYM_CODE_END(early_idt_handler_common)
- __INITDATA
- .balign 4
-GLOBAL(early_recursion_flag)
- .long 0
-
-#define NEXT_PAGE(name) \
- .balign PAGE_SIZE; \
-GLOBAL(name)
+#define SYM_DATA_START_PAGE_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
@@ -358,11 +351,11 @@ GLOBAL(name)
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
-#define NEXT_PGD_PAGE(name) \
- .balign 2 * PAGE_SIZE; \
-GLOBAL(name)
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE)
#else
-#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_DATA_START_PAGE_ALIGNED(name)
#define PTI_USER_PGD_FILL 0
#endif
@@ -375,17 +368,23 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PGD_PAGE(early_top_pgt)
+ .balign 4
+
+SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(early_top_pgt)
-NEXT_PAGE(early_dynamic_pgts)
+SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+SYM_DATA_END(early_dynamic_pgts)
+
+SYM_DATA(early_recursion_flag, .long 0)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
-NEXT_PGD_PAGE(init_top_pgt)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -393,11 +392,13 @@ NEXT_PGD_PAGE(init_top_pgt)
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
-NEXT_PAGE(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.fill 511, 8, 0
-NEXT_PAGE(level2_ident_pgt)
+SYM_DATA_END(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
/*
* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
@@ -407,25 +408,29 @@ NEXT_PAGE(level2_ident_pgt)
* the CPU should ignore the bit.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(level2_ident_pgt)
#else
-NEXT_PGD_PAGE(init_top_pgt)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
#endif
#ifdef CONFIG_X86_5LEVEL
-NEXT_PAGE(level4_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level4_kernel_pgt)
#endif
-NEXT_PAGE(level3_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level3_kernel_pgt)
-NEXT_PAGE(level2_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
* anyway.
@@ -442,8 +447,9 @@ NEXT_PAGE(level2_kernel_pgt)
*/
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(level2_kernel_pgt)
-NEXT_PAGE(level2_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
.fill (512 - 4 - FIXMAP_PMD_NUM),8,0
pgtno = 0
.rept (FIXMAP_PMD_NUM)
@@ -453,31 +459,32 @@ NEXT_PAGE(level2_fixmap_pgt)
.endr
/* 6 MB reserved space + a 2MB hole */
.fill 4,8,0
+SYM_DATA_END(level2_fixmap_pgt)
-NEXT_PAGE(level1_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.rept (FIXMAP_PMD_NUM)
.fill 512,8,0
.endr
+SYM_DATA_END(level1_fixmap_pgt)
#undef PMDS
.data
.align 16
- .globl early_gdt_descr
-early_gdt_descr:
- .word GDT_ENTRIES*8-1
-early_gdt_descr_base:
- .quad INIT_PER_CPU_VAR(gdt_page)
-
-ENTRY(phys_base)
- /* This must match the first entry in level2_kernel_pgt */
- .quad 0x0000000000000000
+
+SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
+SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))
+
+ .align 16
+/* This must match the first entry in level2_kernel_pgt */
+SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
-NEXT_PAGE(empty_zero_page)
+SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
+SYM_DATA_END(empty_zero_page)
EXPORT_SYMBOL(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c6f791bc481e..7a50f0b62a70 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -84,7 +84,7 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
static inline void hpet_set_mapping(void)
{
- hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE);
}
static inline void hpet_clear_mapping(void)
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
index 4c407833faca..4d4f5d9faac3 100644
--- a/arch/x86/kernel/ima_arch.c
+++ b/arch/x86/kernel/ima_arch.c
@@ -74,9 +74,9 @@ bool arch_ima_get_secureboot(void)
/* secureboot arch rules */
static const char * const sb_arch_rules[] = {
-#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG)
+#if !IS_ENABLED(CONFIG_KEXEC_SIG)
"appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
-#endif /* CONFIG_KEXEC_VERIFY_SIG */
+#endif /* CONFIG_KEXEC_SIG */
"measure func=KEXEC_KERNEL_CHECK",
#if !IS_ENABLED(CONFIG_MODULE_SIG)
"appraise func=MODULE_CHECK appraise_type=imasig",
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 0fe1c8782208..8abeee0dd7bf 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -3,35 +3,74 @@
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus. 32/64 bits code unification by Miguel BotĂłn.
*/
-
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/kernel.h>
#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
+#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/bitmap.h>
-#include <asm/syscalls.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/io_bitmap.h>
#include <asm/desc.h>
+#ifdef CONFIG_X86_IOPL_IOPERM
+
+static atomic64_t io_bitmap_sequence;
+
+void io_bitmap_share(struct task_struct *tsk)
+{
+ /* Can be NULL when current->thread.iopl_emul == 3 */
+ if (current->thread.io_bitmap) {
+ /*
+ * Take a refcount on current's bitmap. It can be used by
+ * both tasks as long as none of them changes the bitmap.
+ */
+ refcount_inc(&current->thread.io_bitmap->refcnt);
+ tsk->thread.io_bitmap = current->thread.io_bitmap;
+ }
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+}
+
+static void task_update_io_bitmap(void)
+{
+ struct thread_struct *t = &current->thread;
+
+ if (t->iopl_emul == 3 || t->io_bitmap) {
+ /* TSS update is handled on exit to user space */
+ set_thread_flag(TIF_IO_BITMAP);
+ } else {
+ clear_thread_flag(TIF_IO_BITMAP);
+ /* Invalidate TSS */
+ preempt_disable();
+ tss_update_io_bitmap();
+ preempt_enable();
+ }
+}
+
+void io_bitmap_exit(void)
+{
+ struct io_bitmap *iobm = current->thread.io_bitmap;
+
+ current->thread.io_bitmap = NULL;
+ task_update_io_bitmap();
+ if (iobm && refcount_dec_and_test(&iobm->refcnt))
+ kfree(iobm);
+}
+
/*
- * this changes the io permissions bitmap in the current task.
+ * This changes the io permissions bitmap in the current task.
*/
long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
struct thread_struct *t = &current->thread;
- struct tss_struct *tss;
- unsigned int i, max_long, bytes, bytes_updated;
+ unsigned int i, max_long;
+ struct io_bitmap *iobm;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
- if (turn_on && !capable(CAP_SYS_RAWIO))
+ if (turn_on && (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT)))
return -EPERM;
/*
@@ -39,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
* IO bitmap up. ioperm() is much less timing critical than clone(),
* this is why we delay this operation until now:
*/
- if (!t->io_bitmap_ptr) {
- unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-
- if (!bitmap)
+ iobm = t->io_bitmap;
+ if (!iobm) {
+ /* No point to allocate a bitmap just to clear permissions */
+ if (!turn_on)
+ return 0;
+ iobm = kmalloc(sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
return -ENOMEM;
- memset(bitmap, 0xff, IO_BITMAP_BYTES);
- t->io_bitmap_ptr = bitmap;
- set_thread_flag(TIF_IO_BITMAP);
+ memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap));
+ refcount_set(&iobm->refcnt, 1);
+ }
- /*
- * Now that we have an IO bitmap, we need our TSS limit to be
- * correct. It's fine if we are preempted after doing this:
- * with TIF_IO_BITMAP set, context switches will keep our TSS
- * limit correct.
- */
- preempt_disable();
- refresh_tss_limit();
- preempt_enable();
+ /*
+ * If the bitmap is not shared, then nothing can take a refcount as
+ * current can obviously not fork at the same time. If it's shared
+ * duplicate it and drop the refcount on the original one.
+ */
+ if (refcount_read(&iobm->refcnt) > 1) {
+ iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
+ return -ENOMEM;
+ refcount_set(&iobm->refcnt, 1);
+ io_bitmap_exit();
}
/*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
+ * Store the bitmap pointer (might be the same if the task already
+ * head one). Must be done here so freeing the bitmap when all
+ * permissions are dropped has the pointer set up.
*/
- tss = &per_cpu(cpu_tss_rw, get_cpu());
+ t->io_bitmap = iobm;
+ /* Mark it active for context switching and exit to user mode */
+ set_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Update the tasks bitmap. The update of the TSS bitmap happens on
+ * exit to user mode. So this needs no protection.
+ */
if (turn_on)
- bitmap_clear(t->io_bitmap_ptr, from, num);
+ bitmap_clear(iobm->bitmap, from, num);
else
- bitmap_set(t->io_bitmap_ptr, from, num);
+ bitmap_set(iobm->bitmap, from, num);
/*
* Search for a (possibly new) maximum. This is simple and stupid,
* to keep it obviously correct:
*/
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = UINT_MAX;
+ for (i = 0; i < IO_BITMAP_LONGS; i++) {
+ if (iobm->bitmap[i] != ~0UL)
max_long = i;
+ }
+ /* All permissions dropped? */
+ if (max_long == UINT_MAX) {
+ io_bitmap_exit();
+ return 0;
+ }
- bytes = (max_long + 1) * sizeof(unsigned long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /* Update the TSS: */
- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ iobm->max = (max_long + 1) * sizeof(unsigned long);
- put_cpu();
+ /*
+ * Update the sequence number to force a TSS update on return to
+ * user mode.
+ */
+ iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence);
return 0;
}
@@ -102,37 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
}
/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ * The sys_iopl functionality depends on the level argument, which if
+ * granted for the task is used to enable access to all 65536 I/O ports.
+ *
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
*
- * Here we just change the flags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine and the semantics vs. syscalls and exceptions is
+ * undefined.
+ *
+ * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3
+ * 3 enables them.
+ *
+ * IOPL is strictly per thread and inherited on fork.
*/
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
- struct pt_regs *regs = current_pt_regs();
struct thread_struct *t = &current->thread;
-
- /*
- * Careful: the IOPL bits in regs->flags are undefined under Xen PV
- * and changing them has no effect.
- */
- unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+ unsigned int old;
if (level > 3)
return -EINVAL;
+
+ old = t->iopl_emul;
+
+ /* No point in going further if nothing changes */
+ if (level == old)
+ return 0;
+
/* Trying to gain more privileges? */
if (level > old) {
- if (!capable(CAP_SYS_RAWIO))
+ if (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
- (level << X86_EFLAGS_IOPL_BIT);
- t->iopl = level << X86_EFLAGS_IOPL_BIT;
- set_iopl_mask(t->iopl);
+
+ t->iopl_emul = level;
+ task_update_io_bitmap();
return 0;
}
+
+#else /* CONFIG_X86_IOPL_IOPERM */
+
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ return -ENOSYS;
+}
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return -ENOSYS;
+}
+
+SYSCALL_DEFINE1(iopl, unsigned int, level)
+{
+ return -ENOSYS;
+}
+#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4215653f8a8e..21efee32e2b1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -243,11 +243,15 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
desc = __this_cpu_read(vector_irq[vector]);
-
- if (!handle_irq(desc, regs)) {
+ if (likely(!IS_ERR_OR_NULL(desc))) {
+ if (IS_ENABLED(CONFIG_X86_32))
+ handle_irq(desc, regs);
+ else
+ generic_handle_irq_desc(desc);
+ } else {
ack_APIC_irq();
- if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) {
+ if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index fc34816c6f04..a759ca97cd01 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,18 +148,13 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
-bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+void handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
int overflow = check_stack_overflow();
- if (IS_ERR_OR_NULL(desc))
- return false;
-
if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
print_stack_overflow();
generic_handle_irq_desc(desc);
}
-
- return true;
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6bf6517a05bb..12df3a4abfdd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,15 +26,6 @@
DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
DECLARE_INIT_PER_CPU(irq_stack_backing_store);
-bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
-{
- if (IS_ERR_OR_NULL(desc))
- return false;
-
- generic_handle_irq_desc(desc);
- return true;
-}
-
#ifdef CONFIG_VMAP_STACK
/*
* VMAP the backing store with guard pages
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index ddeeaac8adda..0db0375235b4 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -7,20 +7,20 @@
/*
* unsigned long native_save_fl(void)
*/
-ENTRY(native_save_fl)
+SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
ret
-ENDPROC(native_save_fl)
+SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
/*
* void native_restore_fl(unsigned long flags)
* %eax/%rdi: flags
*/
-ENTRY(native_restore_fl)
+SYM_FUNC_START(native_restore_fl)
push %_ASM_ARG1
popf
ret
-ENDPROC(native_restore_fl)
+SYM_FUNC_END(native_restore_fl)
EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 3ad34f01de2a..6eb8b50ea07e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -11,6 +11,7 @@
#include <linux/acpi_pmtmr.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
+#include <linux/serial_8250.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/hypervisor.h>
@@ -21,9 +22,24 @@
#include <asm/setup.h>
#include <asm/jailhouse_para.h>
-static __initdata struct jailhouse_setup_data setup_data;
+static struct jailhouse_setup_data setup_data;
+#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1))
+#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2))
+
static unsigned int precalibrated_tsc_khz;
+static void jailhouse_setup_irq(unsigned int irq)
+{
+ struct mpc_intsrc mp_irq = {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+ .srcbusirq = irq,
+ .dstirq = irq,
+ };
+ mp_save_irq(&mp_irq);
+}
+
static uint32_t jailhouse_cpuid_base(void)
{
if (boot_cpu_data.cpuid_level < 0 ||
@@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
static void __init jailhouse_timer_init(void)
{
- lapic_timer_period = setup_data.apic_khz * (1000 / HZ);
+ lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
}
static unsigned long jailhouse_get_tsc(void)
@@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early)
.type = IOAPIC_DOMAIN_STRICT,
.ops = &mp_ioapic_irqdomain_ops,
};
- struct mpc_intsrc mp_irq = {
- .type = MP_INTSRC,
- .irqtype = mp_INT,
- .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
- };
unsigned int cpu;
jailhouse_x2apic_init();
register_lapic_address(0xfee00000);
- for (cpu = 0; cpu < setup_data.num_cpus; cpu++) {
- generic_processor_info(setup_data.cpu_ids[cpu],
+ for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) {
+ generic_processor_info(setup_data.v1.cpu_ids[cpu],
boot_cpu_apic_version);
}
smp_found_config = 1;
- if (setup_data.standard_ioapic) {
+ if (setup_data.v1.standard_ioapic) {
mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
- /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
- mp_irq.srcbusirq = mp_irq.dstirq = 3;
- mp_save_irq(&mp_irq);
-
- mp_irq.srcbusirq = mp_irq.dstirq = 4;
- mp_save_irq(&mp_irq);
+ if (IS_ENABLED(CONFIG_SERIAL_8250) &&
+ setup_data.hdr.version < 2) {
+ /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+ jailhouse_setup_irq(3);
+ jailhouse_setup_irq(4);
+ }
}
}
@@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void)
pcibios_last_bus = 0xff;
#ifdef CONFIG_PCI_MMCONFIG
- if (setup_data.pci_mmconfig_base) {
+ if (setup_data.v1.pci_mmconfig_base) {
pci_mmconfig_add(0, 0, pcibios_last_bus,
- setup_data.pci_mmconfig_base);
+ setup_data.v1.pci_mmconfig_base);
pci_mmcfg_arch_init();
}
#endif
@@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void)
return 0;
}
+#ifdef CONFIG_SERIAL_8250
+static inline bool jailhouse_uart_enabled(unsigned int uart_nr)
+{
+ return setup_data.v2.flags & BIT(uart_nr);
+}
+
+static void jailhouse_serial_fixup(int port, struct uart_port *up,
+ u32 *capabilities)
+{
+ static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8};
+ unsigned int n;
+
+ for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) {
+ if (pcuart_base[n] != up->iobase)
+ continue;
+
+ if (jailhouse_uart_enabled(n)) {
+ pr_info("Enabling UART%u (port 0x%lx)\n", n,
+ up->iobase);
+ jailhouse_setup_irq(up->irq);
+ } else {
+ /* Deactivate UART if access isn't allowed */
+ up->iobase = 0;
+ }
+ break;
+ }
+}
+
+static void __init jailhouse_serial_workaround(void)
+{
+ /*
+ * There are flags inside setup_data that indicate availability of
+ * platform UARTs since setup data version 2.
+ *
+ * In case of version 1, we don't know which UARTs belong Linux. In
+ * this case, unconditionally register 1:1 mapping for legacy UART IRQs
+ * 3 and 4.
+ */
+ if (setup_data.hdr.version > 1)
+ serial8250_set_isa_configurator(jailhouse_serial_fixup);
+}
+#else /* !CONFIG_SERIAL_8250 */
+static inline void jailhouse_serial_workaround(void)
+{
+}
+#endif /* CONFIG_SERIAL_8250 */
+
static void __init jailhouse_init_platform(void)
{
u64 pa_data = boot_params.hdr.setup_data;
+ unsigned long setup_data_len;
struct setup_data header;
void *mapping;
@@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void)
memcpy(&header, mapping, sizeof(header));
early_memunmap(mapping, sizeof(header));
- if (header.type == SETUP_JAILHOUSE &&
- header.len >= sizeof(setup_data)) {
- pa_data += offsetof(struct setup_data, data);
-
- mapping = early_memremap(pa_data, sizeof(setup_data));
- memcpy(&setup_data, mapping, sizeof(setup_data));
- early_memunmap(mapping, sizeof(setup_data));
-
+ if (header.type == SETUP_JAILHOUSE)
break;
- }
pa_data = header.next;
}
@@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void)
if (!pa_data)
panic("Jailhouse: No valid setup data found");
- if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
- panic("Jailhouse: Unsupported setup data structure");
-
- pmtmr_ioport = setup_data.pm_timer_address;
+ /* setup data must at least contain the header */
+ if (header.len < sizeof(setup_data.hdr))
+ goto unsupported;
+
+ pa_data += offsetof(struct setup_data, data);
+ setup_data_len = min_t(unsigned long, sizeof(setup_data),
+ (unsigned long)header.len);
+ mapping = early_memremap(pa_data, setup_data_len);
+ memcpy(&setup_data, mapping, setup_data_len);
+ early_memunmap(mapping, setup_data_len);
+
+ if (setup_data.hdr.version == 0 ||
+ setup_data.hdr.compatible_version !=
+ JAILHOUSE_SETUP_REQUIRED_VERSION ||
+ (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) ||
+ (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN))
+ goto unsupported;
+
+ pmtmr_ioport = setup_data.v1.pm_timer_address;
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
- precalibrated_tsc_khz = setup_data.tsc_khz;
+ precalibrated_tsc_khz = setup_data.v1.tsc_khz;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
pci_probe = 0;
@@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void)
* are none in a non-root cell.
*/
disable_acpi();
+
+ jailhouse_serial_workaround();
+ return;
+
+unsupported:
+ panic("Jailhouse: Unsupported setup data structure");
}
bool jailhouse_paravirt(void)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 044053235302..9c4498ea0b3c 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,15 +16,7 @@
#include <asm/alternative.h>
#include <asm/text-patching.h>
-union jump_code_union {
- char code[JUMP_LABEL_NOP_SIZE];
- struct {
- char jump;
- int offset;
- } __attribute__((packed));
-};
-
-static void bug_at(unsigned char *ip, int line)
+static void bug_at(const void *ip, int line)
{
/*
* The location is not an op that we were expecting.
@@ -35,42 +27,42 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
-static void __jump_label_set_jump_code(struct jump_entry *entry,
- enum jump_label_type type,
- union jump_code_union *code,
- int init)
+static const void *
+__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
{
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- const void *expect;
+ const void *expect, *code;
+ const void *addr, *dest;
int line;
- code->jump = 0xe9;
- code->offset = jump_entry_target(entry) -
- (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ addr = (void *)jump_entry_code(entry);
+ dest = (void *)jump_entry_target(entry);
+
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
if (init) {
expect = default_nop; line = __LINE__;
} else if (type == JUMP_LABEL_JMP) {
expect = ideal_nop; line = __LINE__;
} else {
- expect = code->code; line = __LINE__;
+ expect = code; line = __LINE__;
}
- if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
- bug_at((void *)jump_entry_code(entry), line);
+ if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
+ bug_at(addr, line);
if (type == JUMP_LABEL_NOP)
- memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+ code = ideal_nop;
+
+ return code;
}
-static void __ref __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void inline __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
- union jump_code_union code;
-
- __jump_label_set_jump_code(entry, type, &code, init);
+ const void *opcode = __jump_label_set_jump_code(entry, type, init);
/*
* As long as only a single processor is running and the code is still
@@ -84,32 +76,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), &code,
+ text_poke_early((void *)jump_entry_code(entry), opcode,
JUMP_LABEL_NOP_SIZE);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
- (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+static void __ref jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, 0);
+ __jump_label_transform(entry, type, init);
mutex_unlock(&text_mutex);
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ jump_label_transform(entry, type, 0);
+}
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- struct text_poke_loc *tp;
- void *entry_code;
+ const void *opcode;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -119,55 +112,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
return true;
}
- /*
- * No more space in the vector, tell upper layer to apply
- * the queue before continuing.
- */
- if (tp_vec_nr == TP_VEC_MAX)
- return false;
-
- tp = &tp_vec[tp_vec_nr];
-
- entry_code = (void *)jump_entry_code(entry);
-
- /*
- * The INT3 handler will do a bsearch in the queue, so we need entries
- * to be sorted. We can survive an unsorted list by rejecting the entry,
- * forcing the generic jump_label code to apply the queue. Warning once,
- * to raise the attention to the case of an unsorted entry that is
- * better not happen, because, in the worst case we will perform in the
- * same way as we do without batching - with some more overhead.
- */
- if (tp_vec_nr > 0) {
- int prev = tp_vec_nr - 1;
- struct text_poke_loc *prev_tp = &tp_vec[prev];
-
- if (WARN_ON_ONCE(prev_tp->addr > entry_code))
- return false;
- }
-
- __jump_label_set_jump_code(entry, type,
- (union jump_code_union *) &tp->opcode, 0);
-
- tp->addr = entry_code;
- tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
- tp->len = JUMP_LABEL_NOP_SIZE;
-
- tp_vec_nr++;
-
+ mutex_lock(&text_mutex);
+ opcode = __jump_label_set_jump_code(entry, type, 0);
+ text_poke_queue((void *)jump_entry_code(entry),
+ opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ mutex_unlock(&text_mutex);
return true;
}
void arch_jump_label_transform_apply(void)
{
- if (!tp_vec_nr)
- return;
-
mutex_lock(&text_mutex);
- text_poke_bp_batch(tp_vec, tp_vec_nr);
+ text_poke_finish();
mutex_unlock(&text_mutex);
-
- tp_vec_nr = 0;
}
static enum {
@@ -196,5 +153,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE;
}
if (jlstate == JL_STATE_UPDATE)
- __jump_label_transform(entry, type, 1);
+ jump_label_transform(entry, type, 1);
}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index edaa30b20841..64b6da95af98 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
if (count > node->len - pos)
count = node->len - pos;
- pa = node->paddr + sizeof(struct setup_data) + pos;
+ pa = node->paddr + pos;
+
+ /* Is it direct data or invalid indirect one? */
+ if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT)
+ pa += sizeof(struct setup_data);
+
p = memremap(pa, count, MEMREMAP_WB);
if (!p)
return -ENOMEM;
@@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent)
goto err_dir;
}
- node->paddr = pa_data;
- node->type = data->type;
- node->len = data->len;
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ node->paddr = ((struct setup_indirect *)data->data)->addr;
+ node->type = ((struct setup_indirect *)data->data)->type;
+ node->len = ((struct setup_indirect *)data->data)->len;
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+
create_setup_data_node(d, no, node);
pa_data = data->next;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 5ebcd02cbca7..f293d872602a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -177,9 +177,10 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
* acpi_rsdp=<addr> on kernel command line to make second kernel boot
* without efi.
*/
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return 0;
+ params->secure_boot = boot_params.secure_boot;
ei->efi_loader_signature = current_ei->efi_loader_signature;
ei->efi_systab = current_ei->efi_systab;
ei->efi_systab_hi = current_ei->efi_systab_hi;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 23297ea64f5f..c44fe7d8d9a4 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -416,7 +416,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs)
*/
void kgdb_roundup_cpus(void)
{
- apic->send_IPI_allbutself(APIC_DM_NMI);
+ apic_send_IPI_allbutself(NMI_VECTOR);
}
#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0e0b08008b5a..4d7022a740ab 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,6 +40,7 @@
#include <linux/frame.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -119,14 +120,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
void synthesize_reljump(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
+ __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
void synthesize_relcall(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
+ __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
@@ -301,7 +302,7 @@ static int can_probe(unsigned long paddr)
* Another debugging subsystem might insert this breakpoint.
* In that case, we can't recover it.
*/
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
addr += insn.length;
}
@@ -351,8 +352,12 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
kernel_insn_init(insn, dest, MAX_INSN_SIZE);
insn_get_length(insn);
+ /* We can not probe force emulate prefixed instruction */
+ if (insn_has_emulate_prefix(insn))
+ return 0;
+
/* Another subsystem puts a breakpoint, failed to recover */
- if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* We should not singlestep on the exception masking instructions */
@@ -396,14 +401,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
int len = insn->length;
if (can_boost(insn, p->addr) &&
- MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
+ MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
/*
* These instructions can be executed directly if it
* jumps back to correct address.
*/
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
p->ainsn.boostable = true;
} else {
p->ainsn.boostable = false;
@@ -497,12 +502,14 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ text_poke_sync();
}
void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
+ text_poke_sync();
}
void arch_remove_kprobe(struct kprobe *p)
@@ -580,7 +587,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
if (setup_detour_execution(p, regs, reenter))
return;
-#if !defined(CONFIG_PREEMPT)
+#if !defined(CONFIG_PREEMPTION)
if (p->ainsn.boostable && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
if (!reenter)
@@ -605,7 +612,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
+ if (p->opcode == INT3_INSN_OPCODE)
regs->ip = (unsigned long)p->addr;
else
regs->ip = (unsigned long)p->ainsn.insn;
@@ -691,7 +698,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
reset_current_kprobe();
return 1;
}
- } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ } else if (*addr != INT3_INSN_OPCODE) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 9d4aedece363..3f45b5c43a71 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
long offs;
int i;
- for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ for (i = 0; i < JMP32_INSN_SIZE; i++) {
kp = get_kprobe((void *)addr - i);
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
@@ -62,10 +62,10 @@ found:
if (addr == (unsigned long)kp->addr) {
buf[0] = kp->opcode;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
} else {
offs = addr - (unsigned long)kp->addr - 1;
- memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
}
return (unsigned long)buf;
@@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_END_IDX \
((long)optprobe_template_end - (long)optprobe_template_entry)
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
/* Optimized kprobe call back function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
@@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
regs->cs |= get_kernel_rpl();
regs->gs = 0;
#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
regs->orig_ax = ~0UL;
__this_cpu_write(current_kprobe, &op->kp);
@@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
struct insn insn;
int len = 0, ret;
- while (len < RELATIVEJUMP_SIZE) {
+ while (len < JMP32_INSN_SIZE) {
ret = __copy_instruction(dest + len, src + len, real + len, &insn);
if (!ret || !can_boost(&insn, src + len))
return -EINVAL;
@@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr)
return 0;
/* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
+ if (size - offset < JMP32_INSN_SIZE)
return 0;
/* Decode instructions */
@@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr)
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint */
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
/* Check any instructions don't jump into target */
if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
+ insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ DISP32_SIZE))
return 0;
addr += insn.length;
}
@@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
* Verify if the address gap is in 2GB range, because this uses
* a relative jump.
*/
- rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
if (abs(rel) > 0x7fffffff) {
ret = -ERANGE;
goto err;
@@ -401,9 +399,9 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
/* Set returning jmp instruction at the tail of out-of-line buffer */
synthesize_reljump(buf + len, slot + len,
(u8 *)op->kp.addr + op->optinsn.size);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
- /* We have to use text_poke for instuction buffer because it is RO */
+ /* We have to use text_poke() for instruction buffer because it is RO */
text_poke(slot, buf, len);
ret = 0;
out:
@@ -416,44 +414,50 @@ err:
}
/*
- * Replace breakpoints (int3) with relative jumps.
+ * Replace breakpoints (INT3) with relative jumps (JMP.d32).
* Caller must call with locking kprobe_mutex and text_mutex.
+ *
+ * The caller will have installed a regular kprobe and after that issued
+ * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in
+ * the 4 bytes after the INT3 are unused and can now be overwritten.
*/
void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
- u8 insn_buff[RELATIVEJUMP_SIZE];
+ u8 insn_buff[JMP32_INSN_SIZE];
list_for_each_entry_safe(op, tmp, oplist, list) {
s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+ ((long)op->kp.addr + JMP32_INSN_SIZE));
WARN_ON(kprobe_disabled(&op->kp));
/* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
+ DISP32_SIZE);
- insn_buff[0] = RELATIVEJUMP_OPCODE;
+ insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
}
-/* Replace a relative jump with a breakpoint (int3). */
+/*
+ * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
+ *
+ * After that, we can restore the 4 bytes after the INT3 to undo what
+ * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
+ * unused once the INT3 lands.
+ */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- u8 insn_buff[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- insn_buff[0] = BREAKPOINT_INSTRUCTION;
- memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ arch_arm_kprobe(&op->kp);
+ text_poke(op->kp.addr + INT3_INSN_SIZE,
+ op->optinsn.copied_insn, DISP32_SIZE);
+ text_poke_sync();
}
/*
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 7969da939213..d0a19121c6a4 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size)
if (!data)
return -ENOMEM;
if (nr == i) {
- *size = data->len;
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
+ *size = ((struct setup_indirect *)data->data)->len;
+ else
+ *size = data->len;
+
memunmap(data);
return 0;
}
@@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj,
if (!data)
return -ENOMEM;
- ret = sprintf(buf, "0x%x\n", data->type);
+ if (data->type == SETUP_INDIRECT)
+ ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type);
+ else
+ ret = sprintf(buf, "0x%x\n", data->type);
memunmap(data);
return ret;
}
@@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp,
loff_t off, size_t count)
{
int nr, ret = 0;
- u64 paddr;
+ u64 paddr, len;
struct setup_data *data;
void *p;
@@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp,
if (!data)
return -ENOMEM;
- if (off > data->len) {
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ paddr = ((struct setup_indirect *)data->data)->addr;
+ len = ((struct setup_indirect *)data->data)->len;
+ } else {
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+
+ if (off > len) {
ret = -EINVAL;
goto out;
}
- if (count > data->len - off)
- count = data->len - off;
+ if (count > len - off)
+ count = len - off;
if (!count)
goto out;
ret = count;
- p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
+ p = memremap(paddr, len, MEMREMAP_WB);
if (!p) {
ret = -ENOMEM;
goto out;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4ab377c9fffe..d817f255aed8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -33,6 +33,7 @@
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
+#include <asm/cpuidle_haltpoll.h>
static int kvmapf = 1;
@@ -244,17 +245,13 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- enum ctx_state prev_state;
-
switch (kvm_read_and_reset_pf_reason()) {
default:
do_page_fault(regs, error_code, address);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
- prev_state = exception_enter();
kvm_async_pf_task_wait((u32)address, !user_mode(regs));
- exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
@@ -311,7 +308,7 @@ static void kvm_guest_cpu_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
pa |= KVM_ASYNC_PF_ENABLED;
@@ -502,16 +499,6 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
__send_ipi_mask(local_mask, vector);
}
-static void kvm_send_ipi_allbutself(int vector)
-{
- kvm_send_ipi_mask_allbutself(cpu_online_mask, vector);
-}
-
-static void kvm_send_ipi_all(int vector)
-{
- __send_ipi_mask(cpu_online_mask, vector);
-}
-
/*
* Set the IPI entry points
*/
@@ -519,8 +506,6 @@ static void kvm_setup_pv_ipi(void)
{
apic->send_IPI_mask = kvm_send_ipi_mask;
apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
- apic->send_IPI_allbutself = kvm_send_ipi_allbutself;
- apic->send_IPI_all = kvm_send_ipi_all;
pr_info("KVM setup pv IPIs\n");
}
@@ -705,6 +690,7 @@ unsigned int kvm_arch_para_hints(void)
{
return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
+EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
static uint32_t __init kvm_detect(void)
{
@@ -750,6 +736,9 @@ static __init int kvm_setup_pv_tlb_flush(void)
{
int cpu;
+ if (!kvm_para_available() || nopv)
+ return 0;
+
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
@@ -867,3 +856,39 @@ void __init kvm_spinlock_init(void)
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+
+static void kvm_disable_host_haltpoll(void *i)
+{
+ wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+}
+
+static void kvm_enable_host_haltpoll(void *i)
+{
+ wrmsrl(MSR_KVM_POLL_CONTROL, 1);
+}
+
+void arch_haltpoll_enable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
+ pr_err_once("kvm: host does not support poll control\n");
+ pr_err_once("kvm: host upgrade recommended\n");
+ return;
+ }
+
+ /* Enable guest halt poll disables host halt poll */
+ smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
+
+void arch_haltpoll_disable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ return;
+
+ /* Enable guest halt poll disables host halt poll */
+ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
+#endif
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b2463fcb20a8..c57e1ca70fd1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -28,6 +28,89 @@
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+#include <asm/pgtable_areas.h>
+
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
+void load_mm_ldt(struct mm_struct *mm)
+{
+ struct ldt_struct *ldt;
+
+ /* READ_ONCE synchronizes with smp_store_release */
+ ldt = READ_ONCE(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
+ clear_LDT();
+ }
+}
+
+void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
static void refresh_ldt_segments(void)
{
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 77854b192fef..02bddfc122a4 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -148,7 +148,7 @@ int machine_kexec_prepare(struct kimage *image)
{
int error;
- set_pages_x(image->control_code_page, 1);
+ set_memory_x((unsigned long)page_address(image->control_code_page), 1);
error = machine_kexec_alloc_page_tables(image);
if (error)
return error;
@@ -162,7 +162,7 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
- set_pages_nx(image->control_code_page, 1);
+ set_memory_nx((unsigned long)page_address(image->control_code_page), 1);
machine_kexec_free_page_tables(image);
}
@@ -250,15 +250,3 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_X86_PAE
- VMCOREINFO_CONFIG(X86_PAE);
-#endif
-}
-
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 5dcd438ad8f2..ad5cdd6a5f23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -298,48 +298,6 @@ static void load_segments(void)
);
}
-#ifdef CONFIG_KEXEC_FILE
-/* Update purgatory as needed after various image segments have been prepared */
-static int arch_update_purgatory(struct kimage *image)
-{
- int ret = 0;
-
- if (!image->file_mode)
- return 0;
-
- /* Setup copying of backup region */
- if (image->type == KEXEC_TYPE_CRASH) {
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_dest",
- &image->arch.backup_load_addr,
- sizeof(image->arch.backup_load_addr), 0);
- if (ret)
- return ret;
-
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_src",
- &image->arch.backup_src_start,
- sizeof(image->arch.backup_src_start), 0);
- if (ret)
- return ret;
-
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_sz",
- &image->arch.backup_src_sz,
- sizeof(image->arch.backup_src_sz), 0);
- if (ret)
- return ret;
- }
-
- return ret;
-}
-#else /* !CONFIG_KEXEC_FILE */
-static inline int arch_update_purgatory(struct kimage *image)
-{
- return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
- /* update purgatory as needed */
- result = arch_update_purgatory(image);
- if (result)
- return result;
-
return 0;
}
@@ -445,25 +398,6 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-void arch_crash_save_vmcoreinfo(void)
-{
- u64 sme_mask = sme_me_mask;
-
- VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_top_pgt);
- vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
- pgtable_l5_enabled());
-
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
- vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
- kaslr_offset());
- VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
- VMCOREINFO_NUMBER(sme_mask);
-}
-
/* arch-dependent functionality related to kexec file-based syscall */
#ifdef CONFIG_KEXEC_FILE
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3db2252b958d..1547be359d7f 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -34,6 +34,7 @@
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
+#include <linux/security.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -79,6 +80,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
int err = 0;
ssize_t bytes = 0;
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ return err;
+
if (count % 8)
return -EINVAL; /* Invalid chunk size */
@@ -130,6 +135,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = -EFAULT;
break;
}
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ break;
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4df7705022b9..54c21d6abd5a 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -104,18 +104,22 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);
-static void nmi_max_handler(struct irq_work *w)
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
- struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ u64 whole_msecs = READ_ONCE(action->max_duration);
int remainder_ns, decimal_msecs;
- u64 whole_msecs = READ_ONCE(a->max_duration);
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
remainder_ns = do_div(whole_msecs, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
printk_ratelimited(KERN_INFO
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
- a->handler, whole_msecs, decimal_msecs);
+ action->handler, whole_msecs, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)
@@ -142,11 +146,7 @@ static int nmi_handle(unsigned int type, struct pt_regs *regs)
delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);
- if (delta < nmi_longest_ns || delta < a->max_duration)
- continue;
-
- a->max_duration = delta;
- irq_work_queue(&a->irq_work);
+ nmi_check_duration(a, delta);
}
rcu_read_unlock();
@@ -164,8 +164,6 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;
- init_irq_work(&action->irq_work, nmi_max_handler);
-
raw_spin_lock_irqsave(&desc->lock, flags);
/*
@@ -512,6 +510,9 @@ NOKPROBE_SYMBOL(is_debug_stack);
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
+ if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
+ return;
+
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);
return;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 0aa6256eedd8..789f5e4f89de 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -311,10 +311,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
-#ifdef CONFIG_X86_64
- .cpu.read_cr8 = native_read_cr8,
- .cpu.write_cr8 = native_write_cr8,
-#endif
.cpu.wbinvd = native_wbinvd,
.cpu.read_msr = native_read_msr,
.cpu.write_msr = native_write_msr,
@@ -345,8 +341,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.iret = native_iret,
.cpu.swapgs = native_swapgs,
- .cpu.set_iopl_mask = native_set_iopl_mask,
-
.cpu.start_context_switch = paravirt_nop,
.cpu.end_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
deleted file mode 100644
index 9d4343aa481b..000000000000
--- a/arch/x86/kernel/pci-calgary_64.c
+++ /dev/null
@@ -1,1584 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Derived from arch/powerpc/kernel/iommu.c
- *
- * Copyright IBM Corporation, 2006-2007
- * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
- *
- * Author: Jon Mason <jdmason@kudzu.us>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
-
- */
-
-#define pr_fmt(fmt) "Calgary: " fmt
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/crash_dump.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-direct.h>
-#include <linux/bitmap.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/scatterlist.h>
-#include <linux/iommu-helper.h>
-
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-#include <asm/tce.h>
-#include <asm/pci-direct.h>
-#include <asm/dma.h>
-#include <asm/rio.h>
-#include <asm/bios_ebda.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-
-#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
-int use_calgary __read_mostly = 1;
-#else
-int use_calgary __read_mostly = 0;
-#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
-
-#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
-
-/* register offsets inside the host bridge space */
-#define CALGARY_CONFIG_REG 0x0108
-#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
-#define PHB_PLSSR_OFFSET 0x0120
-#define PHB_CONFIG_RW_OFFSET 0x0160
-#define PHB_IOBASE_BAR_LOW 0x0170
-#define PHB_IOBASE_BAR_HIGH 0x0180
-#define PHB_MEM_1_LOW 0x0190
-#define PHB_MEM_1_HIGH 0x01A0
-#define PHB_IO_ADDR_SIZE 0x01B0
-#define PHB_MEM_1_SIZE 0x01C0
-#define PHB_MEM_ST_OFFSET 0x01D0
-#define PHB_AER_OFFSET 0x0200
-#define PHB_CONFIG_0_HIGH 0x0220
-#define PHB_CONFIG_0_LOW 0x0230
-#define PHB_CONFIG_0_END 0x0240
-#define PHB_MEM_2_LOW 0x02B0
-#define PHB_MEM_2_HIGH 0x02C0
-#define PHB_MEM_2_SIZE_HIGH 0x02D0
-#define PHB_MEM_2_SIZE_LOW 0x02E0
-#define PHB_DOSHOLE_OFFSET 0x08E0
-
-/* CalIOC2 specific */
-#define PHB_SAVIOR_L2 0x0DB0
-#define PHB_PAGE_MIG_CTRL 0x0DA8
-#define PHB_PAGE_MIG_DEBUG 0x0DA0
-#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
-
-/* PHB_CONFIG_RW */
-#define PHB_TCE_ENABLE 0x20000000
-#define PHB_SLOT_DISABLE 0x1C000000
-#define PHB_DAC_DISABLE 0x01000000
-#define PHB_MEM2_ENABLE 0x00400000
-#define PHB_MCSR_ENABLE 0x00100000
-/* TAR (Table Address Register) */
-#define TAR_SW_BITS 0x0000ffffffff800fUL
-#define TAR_VALID 0x0000000000000008UL
-/* CSR (Channel/DMA Status Register) */
-#define CSR_AGENT_MASK 0xffe0ffff
-/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
-/* PMCR/PMDR (Page Migration Control/Debug Registers */
-#define PMR_SOFTSTOP 0x80000000
-#define PMR_SOFTSTOPFAULT 0x40000000
-#define PMR_HARDSTOP 0x20000000
-
-/*
- * The maximum PHB bus number.
- * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
- * x3950M2: 4 chassis, 48 PHBs per chassis = 192
- * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
- * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
- */
-#define MAX_PHB_BUS_NUM 256
-
-#define PHBS_PER_CALGARY 4
-
-/* register offsets in Calgary's internal register space */
-static const unsigned long tar_offsets[] = {
- 0x0580 /* TAR0 */,
- 0x0588 /* TAR1 */,
- 0x0590 /* TAR2 */,
- 0x0598 /* TAR3 */
-};
-
-static const unsigned long split_queue_offsets[] = {
- 0x4870 /* SPLIT QUEUE 0 */,
- 0x5870 /* SPLIT QUEUE 1 */,
- 0x6870 /* SPLIT QUEUE 2 */,
- 0x7870 /* SPLIT QUEUE 3 */
-};
-
-static const unsigned long phb_offsets[] = {
- 0x8000 /* PHB0 */,
- 0x9000 /* PHB1 */,
- 0xA000 /* PHB2 */,
- 0xB000 /* PHB3 */
-};
-
-/* PHB debug registers */
-
-static const unsigned long phb_debug_offsets[] = {
- 0x4000 /* PHB 0 DEBUG */,
- 0x5000 /* PHB 1 DEBUG */,
- 0x6000 /* PHB 2 DEBUG */,
- 0x7000 /* PHB 3 DEBUG */
-};
-
-/*
- * STUFF register for each debug PHB,
- * byte 1 = start bus number, byte 2 = end bus number
- */
-
-#define PHB_DEBUG_STUFF_OFFSET 0x0020
-
-unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
-static int translate_empty_slots __read_mostly = 0;
-static int calgary_detected __read_mostly = 0;
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
-
-struct calgary_bus_info {
- void *tce_space;
- unsigned char translation_disabled;
- signed char phbid;
- void __iomem *bbar;
-};
-
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calgary_tce_cache_blast(struct iommu_table *tbl);
-static void calgary_dump_error_regs(struct iommu_table *tbl);
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calioc2_tce_cache_blast(struct iommu_table *tbl);
-static void calioc2_dump_error_regs(struct iommu_table *tbl);
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
-static void get_tce_space_from_tar(void);
-
-static const struct cal_chipset_ops calgary_chip_ops = {
- .handle_quirks = calgary_handle_quirks,
- .tce_cache_blast = calgary_tce_cache_blast,
- .dump_error_regs = calgary_dump_error_regs
-};
-
-static const struct cal_chipset_ops calioc2_chip_ops = {
- .handle_quirks = calioc2_handle_quirks,
- .tce_cache_blast = calioc2_tce_cache_blast,
- .dump_error_regs = calioc2_dump_error_regs
-};
-
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-
-static inline int translation_enabled(struct iommu_table *tbl)
-{
- /* only PHBs with translation enabled have an IOMMU table */
- return (tbl != NULL);
-}
-
-static void iommu_range_reserve(struct iommu_table *tbl,
- unsigned long start_addr, unsigned int npages)
-{
- unsigned long index;
- unsigned long end;
- unsigned long flags;
-
- index = start_addr >> PAGE_SHIFT;
-
- /* bail out if we're asked to reserve a region we don't cover */
- if (index >= tbl->it_size)
- return;
-
- end = index + npages;
- if (end > tbl->it_size) /* don't go off the table */
- end = tbl->it_size;
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_set(tbl->it_map, index, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static unsigned long iommu_range_alloc(struct device *dev,
- struct iommu_table *tbl,
- unsigned int npages)
-{
- unsigned long flags;
- unsigned long offset;
- unsigned long boundary_size;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- BUG_ON(npages == 0);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- tbl->chip_ops->tce_cache_blast(tbl);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- pr_warn("IOMMU full\n");
- spin_unlock_irqrestore(&tbl->it_lock, flags);
- if (panic_on_overflow)
- panic("Calgary: fix the allocator.\n");
- else
- return DMA_MAPPING_ERROR;
- }
- }
-
- tbl->it_hint = offset + npages;
- BUG_ON(tbl->it_hint > tbl->it_size);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
- return offset;
-}
-
-static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
- void *vaddr, unsigned int npages, int direction)
-{
- unsigned long entry;
- dma_addr_t ret;
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == DMA_MAPPING_ERROR)) {
- pr_warn("failed to allocate %u pages in iommu %p\n",
- npages, tbl);
- return DMA_MAPPING_ERROR;
- }
-
- /* set the return dma address */
- ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
-
- /* put the TCEs in the HW table */
- tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
- direction);
- return ret;
-}
-
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
-{
- unsigned long entry;
- unsigned long flags;
-
- /* were we called with bad_dma_address? */
- if (unlikely(dma_addr == DMA_MAPPING_ERROR)) {
- WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
- "address 0x%Lx\n", dma_addr);
- return;
- }
-
- entry = dma_addr >> PAGE_SHIFT;
-
- BUG_ON(entry + npages > tbl->it_size);
-
- tce_free(tbl, entry, npages);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_clear(tbl->it_map, entry, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static inline struct iommu_table *find_iommu_table(struct device *dev)
-{
- struct pci_dev *pdev;
- struct pci_bus *pbus;
- struct iommu_table *tbl;
-
- pdev = to_pci_dev(dev);
-
- /* search up the device tree for an iommu */
- pbus = pdev->bus;
- do {
- tbl = pci_iommu(pbus);
- if (tbl && tbl->it_busno == pbus->number)
- break;
- tbl = NULL;
- pbus = pbus->parent;
- } while (pbus);
-
- BUG_ON(tbl && (tbl->it_busno != pbus->number));
-
- return tbl;
-}
-
-static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems,enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- int i;
-
- if (!translation_enabled(tbl))
- return;
-
- for_each_sg(sglist, s, nelems, i) {
- unsigned int npages;
- dma_addr_t dma = s->dma_address;
- unsigned int dmalen = s->dma_length;
-
- if (dmalen == 0)
- break;
-
- npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
- iommu_free(tbl, dma, npages);
- }
-}
-
-static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- unsigned long vaddr;
- unsigned int npages;
- unsigned long entry;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- BUG_ON(!sg_page(s));
-
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == DMA_MAPPING_ERROR) {
- /* makes sure unmap knows to stop */
- s->dma_length = 0;
- goto error;
- }
-
- s->dma_address = (entry << PAGE_SHIFT) | s->offset;
-
- /* insert into HW table */
- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
-
- s->dma_length = s->length;
- }
-
- return nelems;
-error:
- calgary_unmap_sg(dev, sg, nelems, dir, 0);
- for_each_sg(sg, s, nelems, i) {
- sg->dma_address = DMA_MAPPING_ERROR;
- sg->dma_length = 0;
- }
- return 0;
-}
-
-static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- void *vaddr = page_address(page) + offset;
- unsigned long uaddr;
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- uaddr = (unsigned long)vaddr;
- npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
-
- return iommu_alloc(dev, tbl, vaddr, npages, dir);
-}
-
-static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- unsigned int npages;
-
- npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- iommu_free(tbl, dma_addr, npages);
-}
-
-static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
-{
- void *ret = NULL;
- dma_addr_t mapping;
- unsigned int npages, order;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size); /* size rounded up to full pages */
- npages = size >> PAGE_SHIFT;
- order = get_order(size);
-
- /* alloc enough pages (and possibly more) */
- ret = (void *)__get_free_pages(flag, order);
- if (!ret)
- goto error;
- memset(ret, 0, size);
-
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == DMA_MAPPING_ERROR)
- goto free;
- *dma_handle = mapping;
- return ret;
-free:
- free_pages((unsigned long)ret, get_order(size));
- ret = NULL;
-error:
- return ret;
-}
-
-static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle,
- unsigned long attrs)
-{
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
-
- iommu_free(tbl, dma_handle, npages);
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static const struct dma_map_ops calgary_dma_ops = {
- .alloc = calgary_alloc_coherent,
- .free = calgary_free_coherent,
- .map_sg = calgary_map_sg,
- .unmap_sg = calgary_unmap_sg,
- .map_page = calgary_map_page,
- .unmap_page = calgary_unmap_page,
- .dma_supported = dma_direct_supported,
-};
-
-static inline void __iomem * busno_to_bbar(unsigned char num)
-{
- return bus_info[num].bbar;
-}
-
-static inline int busno_to_phbid(unsigned char num)
-{
- return bus_info[num].phbid;
-}
-
-static inline unsigned long split_queue_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return split_queue_offsets[idx];
-}
-
-static inline unsigned long tar_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return tar_offsets[idx];
-}
-
-static inline unsigned long phb_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return phb_offsets[idx];
-}
-
-static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
-{
- unsigned long target = ((unsigned long)bar) | offset;
- return (void __iomem*)target;
-}
-
-static inline int is_calioc2(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALIOC2);
-}
-
-static inline int is_calgary(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALGARY);
-}
-
-static inline int is_cal_pci_dev(unsigned short device)
-{
- return (is_calgary(device) || is_calioc2(device));
-}
-
-static void calgary_tce_cache_blast(struct iommu_table *tbl)
-{
- u64 val;
- u32 aer;
- int i = 0;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
-
- /* disable arbitration on the bus */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- aer = readl(target);
- writel(0, target);
-
- /* read plssr to ensure it got there */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- val = readl(target);
-
- /* poll split queues until all DMA activity is done */
- target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
- do {
- val = readq(target);
- i++;
- } while ((val & 0xff) != 0xff && i < 100);
- if (i == 100)
- pr_warn("PCI bus not quiesced, continuing anyway\n");
-
- /* invalidate TCE cache */
- target = calgary_reg(bbar, tar_offset(tbl->it_busno));
- writeq(tbl->tar_val, target);
-
- /* enable arbitration */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- writel(aer, target);
- (void)readl(target); /* flush */
-}
-
-static void calioc2_tce_cache_blast(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u64 val64;
- u32 val;
- int i = 0;
- int count = 1;
- unsigned char bus = tbl->it_busno;
-
-begin:
- printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
- "sequence - count %d\n", bus, count);
-
- /* 1. using the Page Migration Control reg set SoftStop */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
- val |= PMR_SOFTSTOP;
- printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
-
- /* 2. poll split queues until all DMA activity is done */
- printk(KERN_DEBUG "2a. starting to poll split queues\n");
- target = calgary_reg(bbar, split_queue_offset(bus));
- do {
- val64 = readq(target);
- i++;
- } while ((val64 & 0xff) != 0xff && i < 100);
- if (i == 100)
- pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
-
- /* 3. poll Page Migration DEBUG for SoftStopFault */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
-
- /* 4. if SoftStopFault - goto (1) */
- if (val & PMR_SOFTSTOPFAULT) {
- if (++count < 100)
- goto begin;
- else {
- pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
- return; /* pray for the best */
- }
- }
-
- /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
-
- /* 6. invalidate TCE cache */
- printk(KERN_DEBUG "6. invalidating TCE cache\n");
- target = calgary_reg(bbar, tar_offset(bus));
- writeq(tbl->tar_val, target);
-
- /* 7. Re-read PMCR */
- printk(KERN_DEBUG "7a. Re-reading PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
-
- /* 8. Remove HardStop */
- printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = 0;
- printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
-}
-
-static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
- u64 limit)
-{
- unsigned int numpages;
-
- limit = limit | 0xfffff;
- limit++;
-
- numpages = ((limit - start) >> PAGE_SHIFT);
- iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
-}
-
-static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
-{
- void __iomem *target;
- u64 low, high, sizelow;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* peripheral MEM_1 region */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
- sizelow = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
-{
- void __iomem *target;
- u32 val32;
- u64 low, high, sizelow, sizehigh;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* is it enabled? */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- if (!(val32 & PHB_MEM2_ENABLE))
- return;
-
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
- sizelow = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
- sizehigh = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = (sizehigh << 32) | sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-/*
- * some regions of the IO address space do not get translated, so we
- * must not give devices IO addresses in those regions. The regions
- * are the 640KB-1MB region and the two PCI peripheral memory holes.
- * Reserve all of them in the IOMMU bitmap to avoid giving them out
- * later.
- */
-static void __init calgary_reserve_regions(struct pci_dev *dev)
-{
- unsigned int npages;
- u64 start;
- struct iommu_table *tbl = pci_iommu(dev->bus);
-
- /* avoid the BIOS/VGA first 640KB-1MB region */
- /* for CalIOC2 - avoid the entire first MB */
- if (is_calgary(dev->device)) {
- start = (640 * 1024);
- npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
- } else { /* calioc2 */
- start = 0;
- npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
- }
- iommu_range_reserve(tbl, start, npages);
-
- /* reserve the two PCI peripheral memory regions in IO space */
- calgary_reserve_peripheral_mem_1(dev);
- calgary_reserve_peripheral_mem_2(dev);
-}
-
-static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
-{
- u64 val64;
- u64 table_phys;
- void __iomem *target;
- int ret;
- struct iommu_table *tbl;
-
- /* build TCE tables for each PHB */
- ret = build_tce_table(dev, bbar);
- if (ret)
- return ret;
-
- tbl = pci_iommu(dev->bus);
- tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-
- if (is_kdump_kernel())
- calgary_init_bitmap_from_tce_table(tbl);
- else
- tce_free(tbl, 0, tbl->it_size);
-
- if (is_calgary(dev->device))
- tbl->chip_ops = &calgary_chip_ops;
- else if (is_calioc2(dev->device))
- tbl->chip_ops = &calioc2_chip_ops;
- else
- BUG();
-
- calgary_reserve_regions(dev);
-
- /* set TARs for each PHB */
- target = calgary_reg(bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
-
- /* zero out all TAR bits under sw control */
- val64 &= ~TAR_SW_BITS;
- table_phys = (u64)__pa(tbl->it_base);
-
- val64 |= table_phys;
-
- BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
- val64 |= (u64) specified_table_size;
-
- tbl->tar_val = cpu_to_be64(val64);
-
- writeq(tbl->tar_val, target);
- readq(target); /* flush */
-
- return 0;
-}
-
-static void __init calgary_free_bus(struct pci_dev *dev)
-{
- u64 val64;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *target;
- unsigned int bitmapsz;
-
- target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
- val64 &= ~TAR_SW_BITS;
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
- tbl->it_map = NULL;
-
- kfree(tbl);
-
- set_pci_iommu(dev->bus, NULL);
-
- /* Can't free bootmem allocated memory after system is up :-( */
- bus_info[dev->bus->number].tce_space = NULL;
-}
-
-static void calgary_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 csr, plssr;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
- tbl->it_busno, csr, plssr);
-}
-
-static void calioc2_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- u32 csr, csmr, plssr, mck, rcstat;
- void __iomem *target;
- unsigned long phboff = phb_offset(tbl->it_busno);
- unsigned long erroff;
- u32 errregs[7];
- int i;
-
- /* dump CSR */
- target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
- /* dump PLSSR */
- target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
- /* dump CSMR */
- target = calgary_reg(bbar, phboff | 0x290);
- csmr = be32_to_cpu(readl(target));
- /* dump mck */
- target = calgary_reg(bbar, phboff | 0x800);
- mck = be32_to_cpu(readl(target));
-
- pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
-
- pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
- csr, plssr, csmr, mck);
-
- /* dump rest of error regs */
- pr_emerg("");
- for (i = 0; i < ARRAY_SIZE(errregs); i++) {
- /* err regs are at 0x810 - 0x870 */
- erroff = (0x810 + (i * 0x10));
- target = calgary_reg(bbar, phboff | erroff);
- errregs[i] = be32_to_cpu(readl(target));
- pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
- }
- pr_cont("\n");
-
- /* root complex status */
- target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
- rcstat = be32_to_cpu(readl(target));
- printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
- PHB_ROOT_COMPLEX_STATUS);
-}
-
-static void calgary_watchdog(struct timer_list *t)
-{
- struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer);
- void __iomem *bbar = tbl->bbar;
- u32 val32;
- void __iomem *target;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- val32 = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- if (val32 & CSR_AGENT_MASK) {
- tbl->chip_ops->dump_error_regs(tbl);
-
- /* reset error */
- writel(0, target);
-
- /* Disable bus that caused the error */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
- PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_SLOT_DISABLE;
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
- } else {
- /* Reset the timer */
- mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
- }
-}
-
-static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
- unsigned char busnum, unsigned long timeout)
-{
- u64 val64;
- void __iomem *target;
- unsigned int phb_shift = ~0; /* silence gcc */
- u64 mask;
-
- switch (busno_to_phbid(busnum)) {
- case 0: phb_shift = (63 - 19);
- break;
- case 1: phb_shift = (63 - 23);
- break;
- case 2: phb_shift = (63 - 27);
- break;
- case 3: phb_shift = (63 - 35);
- break;
- default:
- BUG_ON(busno_to_phbid(busnum));
- }
-
- target = calgary_reg(bbar, CALGARY_CONFIG_REG);
- val64 = be64_to_cpu(readq(target));
-
- /* zero out this PHB's timer bits */
- mask = ~(0xFUL << phb_shift);
- val64 &= mask;
- val64 |= (timeout << phb_shift);
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-}
-
-static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 val;
-
- /*
- * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
- */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
- val = cpu_to_be32(readl(target));
- val |= 0x00800000;
- writel(cpu_to_be32(val), target);
-}
-
-static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
-
- /*
- * Give split completion a longer timeout on bus 1 for aic94xx
- * http://bugzilla.kernel.org/show_bug.cgi?id=7180
- */
- if (is_calgary(dev->device) && (busnum == 1))
- calgary_set_split_completion_timeout(tbl->bbar, busnum,
- CCR_2SEC_TIMEOUT);
-}
-
-static void __init calgary_enable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* enable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
-
- printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
- (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
- "Calgary" : "CalIOC2", busnum);
- printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
- "bus.\n");
-
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0);
- mod_timer(&tbl->watchdog_timer, jiffies);
-}
-
-static void __init calgary_disable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* disable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
-
- printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- del_timer_sync(&tbl->watchdog_timer);
-}
-
-static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
-{
- pci_dev_get(dev);
- set_pci_iommu(dev->bus, NULL);
-
- /* is the device behind a bridge? */
- if (dev->bus->parent)
- dev->bus->parent->self = dev;
- else
- dev->bus->self = dev;
-}
-
-static int __init calgary_init_one(struct pci_dev *dev)
-{
- void __iomem *bbar;
- struct iommu_table *tbl;
- int ret;
-
- bbar = busno_to_bbar(dev->bus->number);
- ret = calgary_setup_tar(dev, bbar);
- if (ret)
- goto done;
-
- pci_dev_get(dev);
-
- if (dev->bus->parent) {
- if (dev->bus->parent->self)
- printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
- "bus->parent->self!\n", dev);
- dev->bus->parent->self = dev;
- } else
- dev->bus->self = dev;
-
- tbl = pci_iommu(dev->bus);
- tbl->chip_ops->handle_quirks(tbl, dev);
-
- calgary_enable_translation(dev);
-
- return 0;
-
-done:
- return ret;
-}
-
-static int __init calgary_locate_bbars(void)
-{
- int ret;
- int rioidx, phb, bus;
- void __iomem *bbar;
- void __iomem *target;
- unsigned long offset;
- u8 start_bus, end_bus;
- u32 val;
-
- ret = -ENODATA;
- for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
- struct rio_detail *rio = rio_devs[rioidx];
-
- if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
- continue;
-
- /* map entire 1MB of Calgary config space */
- bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
- if (!bbar)
- goto error;
-
- for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
- offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
- target = calgary_reg(bbar, offset);
-
- val = be32_to_cpu(readl(target));
-
- start_bus = (u8)((val & 0x00FF0000) >> 16);
- end_bus = (u8)((val & 0x0000FF00) >> 8);
-
- if (end_bus) {
- for (bus = start_bus; bus <= end_bus; bus++) {
- bus_info[bus].bbar = bbar;
- bus_info[bus].phbid = phb;
- }
- } else {
- bus_info[start_bus].bbar = bbar;
- bus_info[start_bus].phbid = phb;
- }
- }
- }
-
- return 0;
-
-error:
- /* scan bus_info and iounmap any bbars we previously ioremap'd */
- for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
- if (bus_info[bus].bbar)
- iounmap(bus_info[bus].bbar);
-
- return ret;
-}
-
-static int __init calgary_init(void)
-{
- int ret;
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- ret = calgary_locate_bbars();
- if (ret)
- return ret;
-
- /* Purely for kdump kernel case */
- if (is_kdump_kernel())
- get_tce_space_from_tar();
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- calgary_init_one_nontraslated(dev);
- continue;
- }
-
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- ret = calgary_init_one(dev);
- if (ret)
- goto error;
- } while (1);
-
- dev = NULL;
- for_each_pci_dev(dev) {
- struct iommu_table *tbl;
-
- tbl = find_iommu_table(&dev->dev);
-
- if (translation_enabled(tbl))
- dev->dev.dma_ops = &calgary_dma_ops;
- }
-
- return ret;
-
-error:
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- pci_dev_put(dev);
- continue;
- }
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- calgary_disable_translation(dev);
- calgary_free_bus(dev);
- pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
- dev->dev.dma_ops = NULL;
- } while (1);
-
- return ret;
-}
-
-static inline int __init determine_tce_table_size(void)
-{
- int ret;
-
- if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
- return specified_table_size;
-
- if (is_kdump_kernel() && saved_max_pfn) {
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
- ret = TCE_TABLE_SIZE_8M;
- } else {
- /*
- * Use 8M by default (suggested by Muli) if it's not
- * kdump kernel and saved_max_pfn isn't set.
- */
- ret = TCE_TABLE_SIZE_8M;
- }
-
- return ret;
-}
-
-static int __init build_detail_arrays(void)
-{
- unsigned long ptr;
- unsigned numnodes, i;
- int scal_detail_size, rio_detail_size;
-
- numnodes = rio_table_hdr->num_scal_dev;
- if (numnodes > MAX_NUMNODES){
- printk(KERN_WARNING
- "Calgary: MAX_NUMNODES too low! Defined as %d, "
- "but system has %d nodes.\n",
- MAX_NUMNODES, numnodes);
- return -ENODEV;
- }
-
- switch (rio_table_hdr->version){
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- default:
- printk(KERN_WARNING
- "Calgary: Invalid Rio Grande Table Version: %d\n",
- rio_table_hdr->version);
- return -EPROTO;
- }
-
- ptr = ((unsigned long)rio_table_hdr) + 3;
- for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev;
- i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 0;
-}
-
-static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
-{
- int dev;
- u32 val;
-
- if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
- /*
- * FIXME: properly scan for devices across the
- * PCI-to-PCI bridge on every CalIOC2 port.
- */
- return 1;
- }
-
- for (dev = 1; dev < 8; dev++) {
- val = read_pci_config(bus, dev, 0, 0);
- if (val != 0xffffffff)
- break;
- }
- return (val != 0xffffffff);
-}
-
-/*
- * calgary_init_bitmap_from_tce_table():
- * Function for kdump case. In the second/kdump kernel initialize
- * the bitmap based on the tce table entries obtained from first kernel
- */
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
-{
- u64 *tp;
- unsigned int index;
- tp = ((u64 *)tbl->it_base);
- for (index = 0 ; index < tbl->it_size; index++) {
- if (*tp != 0x0)
- set_bit(index, tbl->it_map);
- tp++;
- }
-}
-
-/*
- * get_tce_space_from_tar():
- * Function for kdump case. Get the tce tables from first kernel
- * by reading the contents of the base address register of calgary iommu
- */
-static void __init get_tce_space_from_tar(void)
-{
- int bus;
- void __iomem *target;
- unsigned long tce_space;
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- target = calgary_reg(bus_info[bus].bbar,
- tar_offset(bus));
- tce_space = be64_to_cpu(readq(target));
- tce_space = tce_space & TAR_SW_BITS;
-
- tce_space = tce_space & (~specified_table_size);
- info->tce_space = (u64 *)__va(tce_space);
- }
- }
- return;
-}
-
-static int __init calgary_iommu_init(void)
-{
- int ret;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- return 0;
-}
-
-int __init detect_calgary(void)
-{
- int bus;
- void *tbl;
- int calgary_found = 0;
- unsigned long ptr;
- unsigned int offset, prev_offset;
- int ret;
-
- /*
- * if the user specified iommu=off or iommu=soft or we found
- * another HW IOMMU already, bail out.
- */
- if (no_iommu || iommu_detected)
- return -ENODEV;
-
- if (!use_calgary)
- return -ENODEV;
-
- if (!early_pci_allowed())
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
-
- ptr = (unsigned long)phys_to_virt(get_bios_ebda());
-
- rio_table_hdr = NULL;
- prev_offset = 0;
- offset = 0x180;
- /*
- * The next offset is stored in the 1st word.
- * Only parse up until the offset increases:
- */
- while (offset > prev_offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- prev_offset = offset;
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
- "in EBDA - bailing!\n");
- return -ENODEV;
- }
-
- ret = build_detail_arrays();
- if (ret) {
- printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return -ENOMEM;
- }
-
- specified_table_size = determine_tce_table_size();
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
-
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- /*
- * If it is kdump kernel, find and use tce tables
- * from first kernel, else allocate tce tables here
- */
- if (!is_kdump_kernel()) {
- tbl = alloc_tce_table();
- if (!tbl)
- goto cleanup;
- info->tce_space = tbl;
- }
- calgary_found = 1;
- }
- }
-
- printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
- calgary_found ? "found" : "not found");
-
- if (calgary_found) {
- iommu_detected = 1;
- calgary_detected = 1;
- printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
- specified_table_size);
-
- x86_init.iommu.iommu_init = calgary_iommu_init;
- }
- return calgary_found;
-
-cleanup:
- for (--bus; bus >= 0; --bus) {
- struct calgary_bus_info *info = &bus_info[bus];
-
- if (info->tce_space)
- free_tce_table(info->tce_space);
- }
- return -ENOMEM;
-}
-
-static int __init calgary_parse_options(char *p)
-{
- unsigned int bridge;
- unsigned long val;
- size_t len;
- ssize_t ret;
-
- while (*p) {
- if (!strncmp(p, "64k", 3))
- specified_table_size = TCE_TABLE_SIZE_64K;
- else if (!strncmp(p, "128k", 4))
- specified_table_size = TCE_TABLE_SIZE_128K;
- else if (!strncmp(p, "256k", 4))
- specified_table_size = TCE_TABLE_SIZE_256K;
- else if (!strncmp(p, "512k", 4))
- specified_table_size = TCE_TABLE_SIZE_512K;
- else if (!strncmp(p, "1M", 2))
- specified_table_size = TCE_TABLE_SIZE_1M;
- else if (!strncmp(p, "2M", 2))
- specified_table_size = TCE_TABLE_SIZE_2M;
- else if (!strncmp(p, "4M", 2))
- specified_table_size = TCE_TABLE_SIZE_4M;
- else if (!strncmp(p, "8M", 2))
- specified_table_size = TCE_TABLE_SIZE_8M;
-
- len = strlen("translate_empty_slots");
- if (!strncmp(p, "translate_empty_slots", len))
- translate_empty_slots = 1;
-
- len = strlen("disable");
- if (!strncmp(p, "disable", len)) {
- p += len;
- if (*p == '=')
- ++p;
- if (*p == '\0')
- break;
- ret = kstrtoul(p, 0, &val);
- if (ret)
- break;
-
- bridge = val;
- if (bridge < MAX_PHB_BUS_NUM) {
- printk(KERN_INFO "Calgary: disabling "
- "translation for PHB %#x\n", bridge);
- bus_info[bridge].translation_disabled = 1;
- }
- }
-
- p = strpbrk(p, ",");
- if (!p)
- break;
-
- p++; /* skip ',' */
- }
- return 1;
-}
-__setup("calgary=", calgary_parse_options);
-
-static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
-{
- struct iommu_table *tbl;
- unsigned int npages;
- int i;
-
- tbl = pci_iommu(dev->bus);
-
- for (i = 0; i < 4; i++) {
- struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
-
- /* Don't give out TCEs that map MEM resources */
- if (!(r->flags & IORESOURCE_MEM))
- continue;
-
- /* 0-based? we reserve the whole 1st MB anyway */
- if (!r->start)
- continue;
-
- /* cover the whole region */
- npages = resource_size(r) >> PAGE_SHIFT;
- npages++;
-
- iommu_range_reserve(tbl, r->start, npages);
- }
-}
-
-static int __init calgary_fixup_tce_spaces(void)
-{
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- if (no_iommu || swiotlb || !calgary_detected)
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled)
- continue;
-
- if (!info->tce_space)
- continue;
-
- calgary_fixup_one_tce_space(dev);
-
- } while (1);
-
- return 0;
-}
-
-/*
- * We need to be call after pcibios_assign_resources (fs_initcall level)
- * and before device_initcall.
- */
-rootfs_initcall(calgary_fixup_tce_spaces);
-
-IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f62b498b18fb..5dcedad21dff 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/dma-direct.h>
#include <linux/dma-debug.h>
+#include <linux/iommu.h>
#include <linux/dmar.h>
#include <linux/export.h>
#include <linux/memblock.h>
@@ -11,7 +12,6 @@
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/calgary.h>
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
@@ -34,21 +34,6 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/*
- * This variable becomes 1 if iommu=pt is passed on the kernel command line.
- * If this variable is 1, IOMMU implementations do no DMA translation for
- * devices and allow every device to access to whole physical memory. This is
- * useful if a user wants to use an IOMMU only for KVM device assignment to
- * guests and not for driver dma translation.
- * It is also possible to disable by default in kernel config, and enable with
- * iommu=nopt at boot time.
- */
-#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
-int iommu_pass_through __read_mostly = 1;
-#else
-int iommu_pass_through __read_mostly;
-#endif
-
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
void __init pci_iommu_alloc(void)
@@ -120,17 +105,12 @@ static __init int iommu_setup(char *p)
swiotlb = 1;
#endif
if (!strncmp(p, "pt", 2))
- iommu_pass_through = 1;
+ iommu_set_default_passthrough(true);
if (!strncmp(p, "nopt", 4))
- iommu_pass_through = 0;
+ iommu_set_default_translated(true);
gart_parse_options(p);
-#ifdef CONFIG_CALGARY_IOMMU
- if (!strncmp(p, "calgary", 7))
- use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
p += strcspn(p, ",");
if (*p == ',')
++p;
@@ -160,7 +140,7 @@ rootfs_initcall(pci_iommu_init);
static int via_no_dac_cb(struct pci_dev *pdev, void *data)
{
- pdev->dev.bus_dma_mask = DMA_BIT_MASK(32);
+ pdev->dev.bus_dma_limit = DMA_BIT_MASK(32);
return 0;
}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 5f5302028a9a..c2cfa5e7c152 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-/* Glue code to lib/swiotlb.c */
#include <linux/pci.h>
#include <linux/cache.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 75fea0d48c0e..839b5244e3b7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -41,6 +41,7 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include "process.h"
@@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
},
-#ifdef CONFIG_X86_32
- /*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
-#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
@@ -110,28 +102,89 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
void exit_thread(struct task_struct *tsk)
{
struct thread_struct *t = &tsk->thread;
- unsigned long *bp = t->io_bitmap_ptr;
struct fpu *fpu = &t->fpu;
- if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
-
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- t->io_bitmap_max = 0;
- put_cpu();
- kfree(bp);
- }
+ if (test_thread_flag(TIF_IO_BITMAP))
+ io_bitmap_exit();
free_vm86(t);
fpu__drop(fpu);
}
+static int set_new_tls(struct task_struct *p, unsigned long tls)
+{
+ struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+ if (in_ia32_syscall())
+ return do_set_thread_area(p, -1, utls, 0);
+ else
+ return do_set_thread_area_64(p, ARCH_SET_FS, tls);
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ struct inactive_task_frame *frame;
+ struct fork_frame *fork_frame;
+ struct pt_regs *childregs;
+ int ret = 0;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap = NULL;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef CONFIG_X86_64
+ savesegment(gs, p->thread.gsindex);
+ p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
+ savesegment(fs, p->thread.fsindex);
+ p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+#else
+ p->thread.sp0 = (unsigned long) (childregs + 1);
+ /*
+ * Clear all status flags including IF and set fixed bit. 64bit
+ * does not have this initialization as the frame does not contain
+ * flags. The flags consistency (especially vs. AC) is there
+ * ensured via objtool, which lacks 32bit support.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
+#endif
+
+ /* Kernel thread ? */
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ memset(childregs, 0, sizeof(struct pt_regs));
+ kthread_frame_init(frame, sp, arg);
+ return 0;
+ }
+
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+#ifdef CONFIG_X86_32
+ task_user_gs(p) = get_user_gs(current_pt_regs());
+#endif
+
+ /* Set a new TLS for the child thread? */
+ if (clone_flags & CLONE_SETTLS)
+ ret = set_new_tls(p, tls);
+
+ if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+ io_bitmap_share(p);
+
+ return ret;
+}
+
void flush_thread(void)
{
struct task_struct *tsk = current;
@@ -269,31 +322,96 @@ void arch_setup_new_exec(void)
}
}
-static inline void switch_to_bitmap(struct thread_struct *prev,
- struct thread_struct *next,
- unsigned long tifp, unsigned long tifn)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
+{
+ /*
+ * Invalidate the I/O bitmap by moving io_bitmap_base outside the
+ * TSS limit so any subsequent I/O access from user space will
+ * trigger a #GP.
+ *
+ * This is correct even when VMEXIT rewrites the TSS limit
+ * to 0x67 as the only requirement is that the base points
+ * outside the limit.
+ */
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+}
+
+static inline void switch_to_bitmap(unsigned long tifp)
+{
+ /*
+ * Invalidate I/O bitmap if the previous task used it. This prevents
+ * any possible leakage of an active I/O bitmap.
+ *
+ * If the next task has an I/O bitmap it will handle it on exit to
+ * user mode.
+ */
+ if (tifp & _TIF_IO_BITMAP)
+ tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw));
+}
+
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
+{
+ /*
+ * Copy at least the byte range of the incoming tasks bitmap which
+ * covers the permitted I/O ports.
+ *
+ * If the previous task which used an I/O bitmap had more bits
+ * permitted, then the copy needs to cover those as well so they
+ * get turned off.
+ */
+ memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+ max(tss->io_bitmap.prev_max, iobm->max));
+
+ /*
+ * Store the new max and the sequence number of this bitmap
+ * and a pointer to the bitmap itself.
+ */
+ tss->io_bitmap.prev_max = iobm->max;
+ tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
+/**
+ * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ */
+void tss_update_io_bitmap(void)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct thread_struct *t = &current->thread;
+ u16 *base = &tss->x86_tss.io_bitmap_base;
+
+ if (!test_thread_flag(TIF_IO_BITMAP)) {
+ tss_invalidate_io_bitmap(tss);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+ *base = IO_BITMAP_OFFSET_VALID_ALL;
+ } else {
+ struct io_bitmap *iobm = t->io_bitmap;
- if (tifn & _TIF_IO_BITMAP) {
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- /*
- * Make sure that the TSS limit is correct for the CPU
- * to notice the IO bitmap.
- */
- refresh_tss_limit();
- } else if (tifp & _TIF_IO_BITMAP) {
/*
- * Clear any possible leftover bits:
+ * Only copy bitmap data when the sequence number differs. The
+ * update time is accounted to the incoming task.
*/
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ if (tss->io_bitmap.prev_sequence != iobm->sequence)
+ tss_copy_io_bitmap(tss, iobm);
+
+ /* Enable the bitmap */
+ *base = IO_BITMAP_OFFSET_VALID_MAP;
}
+
+ /*
+ * Make sure that the TSS limit is covering the IO bitmap. It might have
+ * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+ * access from user space to trigger a #GP because tbe bitmap is outside
+ * the TSS limit.
+ */
+ refresh_tss_limit();
}
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
+#endif
#ifdef CONFIG_SMP
@@ -497,15 +615,12 @@ void speculation_ctrl_update_current(void)
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev, *next;
unsigned long tifp, tifn;
- prev = &prev_p->thread;
- next = &next_p->thread;
-
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
- switch_to_bitmap(prev, next, tifp, tifn);
+
+ switch_to_bitmap(tifp);
propagate_user_return_notify(prev_p, next_p);
@@ -580,7 +695,7 @@ void __cpuidle default_idle(void)
safe_halt();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
-#ifdef CONFIG_APM_MODULE
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
#endif
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 320ab978fb1f..1d0797b2338a 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
//
// Code shared between 32 and 64 bit
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b8ceec4974fe..5052ced43373 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
-{
- struct pt_regs *childregs = task_pt_regs(p);
- struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
- struct inactive_task_frame *frame = &fork_frame->frame;
- struct task_struct *tsk;
- int err;
-
- /*
- * For a new task use the RESET flags value since there is no before.
- * All the status flags are zero; DF and all the system flags must also
- * be 0, specifically IF must be 0 because we context switch to the new
- * task with interrupts disabled.
- */
- frame->flags = X86_EFLAGS_FIXED;
- frame->bp = 0;
- frame->ret_addr = (unsigned long) ret_from_fork;
- p->thread.sp = (unsigned long) fork_frame;
- p->thread.sp0 = (unsigned long) (childregs+1);
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- /* kernel thread */
- memset(childregs, 0, sizeof(struct pt_regs));
- frame->bx = sp; /* function */
- frame->di = arg;
- p->thread.io_bitmap_ptr = NULL;
- return 0;
- }
- frame->bx = 0;
- *childregs = *current_pt_regs();
- childregs->ax = 0;
- if (sp)
- childregs->sp = sp;
-
- task_user_gs(p) = get_user_gs(current_pt_regs());
-
- p->thread.io_bitmap_ptr = NULL;
- tsk = current;
- err = -ENOMEM;
-
- if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- err = 0;
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS)
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)tls, 0);
-
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
- return err;
-}
-
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
@@ -192,7 +124,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->ip = new_ip;
regs->sp = new_sp;
regs->flags = X86_EFLAGS_IF;
- force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);
@@ -255,15 +186,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_TLS(next, cpu);
- /*
- * Restore IOPL if needed. In normal use, the flags restore
- * in the switch assembly will handle this. But if the kernel
- * is running virtualized at a non-zero CPL, the popf will
- * not restore flags, so it must be done in a separate step.
- */
- if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
-
switch_to_extra(prev_p, next_p);
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index af64519b2695..ffd497804dbc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
task->thread.gsbase = gsbase;
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
-{
- int err;
- struct pt_regs *childregs;
- struct fork_frame *fork_frame;
- struct inactive_task_frame *frame;
- struct task_struct *me = current;
-
- childregs = task_pt_regs(p);
- fork_frame = container_of(childregs, struct fork_frame, regs);
- frame = &fork_frame->frame;
-
- frame->bp = 0;
- frame->ret_addr = (unsigned long) ret_from_fork;
- p->thread.sp = (unsigned long) fork_frame;
- p->thread.io_bitmap_ptr = NULL;
-
- savesegment(gs, p->thread.gsindex);
- p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
- savesegment(fs, p->thread.fsindex);
- p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
- savesegment(es, p->thread.es);
- savesegment(ds, p->thread.ds);
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- /* kernel thread */
- memset(childregs, 0, sizeof(struct pt_regs));
- frame->bx = sp; /* function */
- frame->r12 = arg;
- return 0;
- }
- frame->bx = 0;
- *childregs = *current_pt_regs();
-
- childregs->ax = 0;
- if (sp)
- childregs->sp = sp;
-
- err = -ENOMEM;
- if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
- if (in_ia32_syscall())
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)tls, 0);
- else
-#endif
- err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
- if (err)
- goto out;
- }
- err = 0;
-out:
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
-
- return err;
-}
-
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
@@ -469,7 +394,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- force_iret();
}
void
@@ -572,17 +496,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_to_extra(prev_p, next_p);
-#ifdef CONFIG_XEN_PV
- /*
- * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
- * current_pt_regs()->flags may not match the current task's
- * intended IOPL. We need to switch it manually.
- */
- if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
- prev->iopl != next->iopl))
- xen_set_iopl_mask(next->iopl);
-#endif
-
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3c5bbe8e4120..f0e1ddbc2fd7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -42,6 +42,7 @@
#include <asm/traps.h>
#include <asm/syscall.h>
#include <asm/fsgsbase.h>
+#include <asm/io_bitmap.h>
#include "tls.h"
@@ -181,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
@@ -208,10 +212,7 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- if (task == current)
- set_user_gs(task_pt_regs(task), value);
- else
- task_user_gs(task) = value;
+ task_user_gs(task) = value;
}
return 0;
@@ -271,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
if (invalid_selector(value))
return -EIO;
+ /*
+ * This function has some ABI oddities.
+ *
+ * A 32-bit ptracer probably expects that writing FS or GS will change
+ * FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
+ * this code indeed has that effect. When FSGSBASE is added, this
+ * will require a special case.
+ *
+ * For existing 64-bit ptracers, writing FS or GS *also* currently
+ * changes the base if the selector is nonzero the next time the task
+ * is run. This behavior may not be needed, and trying to preserve it
+ * when FSGSBASE is added would be complicated at best.
+ */
+
switch (offset) {
case offsetof(struct user_regs_struct,fs):
task->thread.fsindex = value;
- if (task == current)
- loadsegment(fs, task->thread.fsindex);
break;
case offsetof(struct user_regs_struct,gs):
task->thread.gsindex = value;
- if (task == current)
- load_gs_index(task->thread.gsindex);
break;
case offsetof(struct user_regs_struct,ds):
task->thread.ds = value;
- if (task == current)
- loadsegment(ds, task->thread.ds);
break;
case offsetof(struct user_regs_struct,es):
task->thread.es = value;
- if (task == current)
- loadsegment(es, task->thread.es);
break;
/*
@@ -374,6 +384,9 @@ static int putreg(struct task_struct *child,
* When changing the FS base, use do_arch_prctl_64()
* to set the index to zero and to set the base
* as requested.
+ *
+ * NB: This behavior is nonsensical and likely needs to
+ * change when FSGSBASE support is added.
*/
if (child->thread.fsbase != value)
return do_arch_prctl_64(child, ARCH_SET_FS, value);
@@ -697,7 +710,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n,
static int ioperm_active(struct task_struct *target,
const struct user_regset *regset)
{
- return target->thread.io_bitmap_max / regset->size;
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0;
}
static int ioperm_get(struct task_struct *target,
@@ -705,12 +720,13 @@ static int ioperm_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- if (!target->thread.io_bitmap_ptr)
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ if (!iobm)
return -ENXIO;
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- target->thread.io_bitmap_ptr,
- 0, IO_BITMAP_BYTES);
+ iobm->bitmap, 0, IO_BITMAP_BYTES);
}
/*
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8451f38ad399..896d74cb5081 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -90,8 +90,6 @@ static void ich_force_hpet_resume(void)
BUG();
else
printk(KERN_DEBUG "Force enabled HPET at resume\n");
-
- return;
}
static void ich_force_enable_hpet(struct pci_dev *dev)
@@ -112,7 +110,7 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
}
/* use bits 31:14, 16 kB aligned */
- rcba_base = ioremap_nocache(rcba, 0x4000);
+ rcba_base = ioremap(rcba, 0x4000);
if (rcba_base == NULL) {
dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
"cannot force enable HPET\n");
@@ -448,7 +446,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
force_hpet_address);
cached_dev = dev;
- return;
}
/* ISA Bridges */
@@ -513,7 +510,6 @@ static void e6xx_force_enable_hpet(struct pci_dev *dev)
force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
"0x%lx\n", force_hpet_address);
- return;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
e6xx_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 09d6bded3c1e..0cc7c0b106bb 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -828,11 +828,6 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-static void smp_send_nmi_allbutself(void)
-{
- apic->send_IPI_allbutself(NMI_VECTOR);
-}
-
/*
* Halt all other CPUs, calling the specified function on each of them
*
@@ -861,7 +856,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
*/
wmb();
- smp_send_nmi_allbutself();
+ apic_send_IPI_allbutself(NMI_VECTOR);
/* Kick CPUs looping in NMI context. */
WRITE_ONCE(crash_ipi_issued, 1);
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index ee26df08002e..94b33885f8d2 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -35,8 +35,7 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/* Save the CPU context, used for jumping back */
pushl %ebx
@@ -93,8 +92,9 @@ relocate_kernel:
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
ret
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* set return address to 0 if not preserving context */
pushl $0
/* store the start address on the stack */
@@ -191,8 +191,9 @@ identity_mapped:
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
ret
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movl CR4(%edi), %eax
movl %eax, %cr4
movl CR3(%edi), %eax
@@ -208,9 +209,10 @@ virtual_mapped:
popl %esi
popl %ebx
ret
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl 8(%esp), %edx
movl 4(%esp), %ecx
pushl %ebp
@@ -270,6 +272,7 @@ swap_pages:
popl %ebx
popl %ebp
ret
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..ef3ba99068d3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -38,8 +38,7 @@
.text
.align PAGE_SIZE
.code64
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/*
* %rdi indirection_page
* %rsi page_list
@@ -103,8 +102,9 @@ relocate_kernel:
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
ret
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
@@ -209,8 +209,9 @@ identity_mapped:
movq $virtual_mapped, %rax
pushq %rax
ret
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
movq %rax, %cr4
@@ -228,9 +229,10 @@ virtual_mapped:
popq %rbp
popq %rbx
ret
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rdi, %rcx /* Put the page_list in %rcx */
xorl %edi, %edi
xorl %esi, %esi
@@ -283,6 +285,7 @@ swap_pages:
jmp 0b
3:
ret
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbe35bf879f5..a74262c71484 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -2,130 +2,54 @@
/*
* Copyright (C) 1995 Linus Torvalds
*
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
+ * This file contains the setup_arch() code, which handles the architecture-dependent
+ * parts of early kernel initialization.
*/
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/sfi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/memblock.h>
-#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/export.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
+#include <linux/memblock.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-#include <linux/dma-contiguous.h>
-#include <xen/xen.h>
-#include <uapi/linux/mount.h>
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/delay.h>
-
-#include <linux/kallsyms.h>
-#include <linux/cpufreq.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/uaccess.h>
-
-#include <linux/percpu.h>
-#include <linux/crash_dump.h>
+#include <linux/root_dev.h>
+#include <linux/sfi.h>
#include <linux/tboot.h>
-#include <linux/jiffies.h>
-#include <linux/mem_encrypt.h>
-#include <linux/sizes.h>
-
#include <linux/usb/xhci-dbgp.h>
-#include <video/edid.h>
-#include <asm/mtrr.h>
+#include <uapi/linux/mount.h>
+
+#include <xen/xen.h>
+
#include <asm/apic.h>
-#include <asm/realmode.h>
-#include <asm/e820/api.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
#include <asm/bugs.h>
-#include <asm/kasan.h>
-
-#include <asm/vsyscall.h>
#include <asm/cpu.h>
-#include <asm/desc.h>
-#include <asm/dma.h>
-#include <asm/iommu.h>
+#include <asm/efi.h>
#include <asm/gart.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-
-#include <asm/paravirt.h>
#include <asm/hypervisor.h>
-#include <asm/olpc_ofw.h>
-
-#include <asm/percpu.h>
-#include <asm/topology.h>
-#include <asm/apicdef.h>
-#include <asm/amd_nb.h>
+#include <asm/io_apic.h>
+#include <asm/kasan.h>
+#include <asm/kaslr.h>
#include <asm/mce.h>
-#include <asm/alternative.h>
+#include <asm/mtrr.h>
+#include <asm/realmode.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pci-direct.h>
#include <asm/prom.h>
-#include <asm/microcode.h>
-#include <asm/kaslr.h>
+#include <asm/proto.h>
#include <asm/unwind.h>
+#include <asm/vsyscall.h>
+#include <linux/vmalloc.h>
/*
- * max_low_pfn_mapped: highest direct mapped pfn under 4GB
- * max_pfn_mapped: highest direct mapped pfn over 4GB
+ * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
+ * max_pfn_mapped: highest directly mapped pfn > 4 GB
*
* The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
- * represented by pfn_mapped
+ * represented by pfn_mapped[].
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -135,14 +59,30 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif
-static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
-unsigned long _brk_end = (unsigned long)__brk_base;
+/*
+ * Range of the BSS area. The size of the BSS area is determined
+ * at link time, with RESERVE_BRK*() facility reserving additional
+ * chunks.
+ */
+static __initdata
+unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
struct boot_params boot_params;
/*
- * Machine setup..
+ * These are the four main kernel memory regions, we put them into
+ * the resource tree so that kdump tools and other debugging tools
+ * recover it:
*/
+
+static struct resource rodata_resource = {
+ .name = "Kernel rodata",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
@@ -166,16 +106,16 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-/* cpu data as detected by the assembly code in head_32.S */
+/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
-/* common cpu data for all cpus */
+/* Common CPU data for all CPUs */
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
unsigned int def_to_bigsmp;
-/* for MCA, but anyone else can use it if they want */
+/* For MCA, but anyone else can use it if they want */
unsigned int machine_id;
unsigned int machine_submodel_id;
unsigned int BIOS_revision;
@@ -438,6 +378,12 @@ static void __init memblock_x86_reserve_range_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
memblock_reserve(pa_data, sizeof(*data) + data->len);
+
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
+ memblock_reserve(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len);
+
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
@@ -455,15 +401,15 @@ static void __init memblock_x86_reserve_range_setup_data(void)
/*
* Keep the crash kernel below this limit.
*
- * On 32 bits earlier kernels would limit the kernel to the low 512 MiB
+ * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
* due to mapping restrictions.
*
- * On 64bit, kdump kernel need be restricted to be under 64TB, which is
- * the upper limit of system RAM in 4-level paing mode. Since the kdump
- * jumping could be from 5-level to 4-level, the jumping will fail if
- * kernel is put above 64TB, and there's no way to detect the paging mode
- * of the kernel which will be loaded for dumping during the 1st kernel
- * bootup.
+ * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
+ * the upper limit of system RAM in 4-level paging mode. Since the kdump
+ * jump could be from 5-level paging to 4-level paging, the jump will fail if
+ * the kernel is put above 64 TB, and during the 1st kernel bootup there's
+ * no good way to detect the paging mode of the target kernel which will be
+ * loaded for dumping.
*/
#ifdef CONFIG_X86_32
# define CRASH_ADDR_LOW_MAX SZ_512M
@@ -486,7 +432,7 @@ static int __init reserve_crashkernel_low(void)
ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base);
if (ret) {
/*
- * two parts from lib/swiotlb.c:
+ * two parts from kernel/dma/swiotlb.c:
* -swiotlb size: user-specified with swiotlb= or default.
*
* -swiotlb overflow buffer: now hardcoded to 32k. We round it
@@ -743,8 +689,8 @@ static void __init trim_bios_range(void)
e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
/*
- * special case: Some BIOSen report the PC BIOS
- * area (640->1Mb) as ram even though it is not.
+ * special case: Some BIOSes report the PC BIOS
+ * area (640Kb -> 1Mb) as RAM even though it is not.
* take them out.
*/
e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
@@ -874,7 +820,7 @@ void __init setup_arch(char **cmdline_p)
/*
* Note: Quark X1000 CPUs advertise PGE incorrectly and require
* a cr3 based tlb flush, so the following __flush_tlb_all()
- * will not flush anything because the cpu quirk which clears
+ * will not flush anything because the CPU quirk which clears
* X86_FEATURE_PGE has not been invoked yet. Though due to the
* load_cr3() above the TLB has been flushed already. The
* quirk is invoked before subsequent calls to __flush_tlb_all()
@@ -947,11 +893,11 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
- mpx_mm_init(&init_mm);
-
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
- data_resource.start = __pa_symbol(_etext);
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
data_resource.end = __pa_symbol(_edata)-1;
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
@@ -1040,6 +986,7 @@ void __init setup_arch(char **cmdline_p)
/* after parse_early_param, so could debug it */
insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
@@ -1122,17 +1069,15 @@ void __init setup_arch(char **cmdline_p)
reserve_bios_regions();
- if (efi_enabled(EFI_MEMMAP)) {
- efi_fake_memmap();
- efi_find_mirror();
- efi_esrt_init();
+ efi_fake_memmap();
+ efi_find_mirror();
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be
- * called after ExitBootServices(). This is, in fact, a lie.
- */
- efi_reserve_boot_services();
- }
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
+ efi_reserve_boot_services();
/* preallocate 4k for mptable mpc */
e820__memblock_alloc_reserved_mpc_new();
@@ -1281,8 +1226,6 @@ void __init setup_arch(char **cmdline_p)
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
#endif
#endif
x86_init.oem.banner();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 86663874ef04..e6d7894ad127 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void)
pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
- pr_warning("%s allocator failed (%d), falling back to page size\n",
- pcpu_fc_names[pcpu_chosen_fc], rc);
+ pr_warn("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 8eb7193e158d..8a29573851a3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -151,8 +151,6 @@ static int restore_sigcontext(struct pt_regs *regs,
err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
- force_iret();
-
return err;
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96421f97e75c..b8d4e9c3c070 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -115,46 +115,6 @@
static atomic_t stopping_cpu = ATOMIC_INIT(-1);
static bool smp_no_nmi_ipi = false;
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
- if (unlikely(cpu_is_offline(cpu))) {
- WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
- return;
- }
- apic->send_IPI(cpu, RESCHEDULE_VECTOR);
-}
-
-void native_send_call_func_single_ipi(int cpu)
-{
- apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
-}
-
-void native_send_call_func_ipi(const struct cpumask *mask)
-{
- cpumask_var_t allbutself;
-
- if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
- return;
- }
-
- cpumask_copy(allbutself, cpu_online_mask);
- __cpumask_clear_cpu(smp_processor_id(), allbutself);
-
- if (cpumask_equal(mask, allbutself) &&
- cpumask_equal(cpu_online_mask, cpu_callout_mask))
- apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- free_cpumask_var(allbutself);
-}
-
static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
/* We are registered on stopping cpu too, avoid spurious NMI */
@@ -179,6 +139,12 @@ asmlinkage __visible void smp_reboot_interrupt(void)
irq_exit();
}
+static int register_stop_handler(void)
+{
+ return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop");
+}
+
static void native_stop_other_cpus(int wait)
{
unsigned long flags;
@@ -209,42 +175,44 @@ static void native_stop_other_cpus(int wait)
/* sync above data before sending IRQ */
wmb();
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ apic_send_IPI_allbutself(REBOOT_VECTOR);
/*
- * Don't wait longer than a second if the caller
- * didn't ask us to wait.
+ * Don't wait longer than a second for IPI completion. The
+ * wait request is not checked here because that would
+ * prevent an NMI shutdown attempt in case that not all
+ * CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
- while (num_online_cpus() > 1 && (wait || timeout--))
+ while (num_online_cpus() > 1 && timeout--)
udelay(1);
}
-
- /* if the REBOOT_VECTOR didn't work, try with the NMI */
- if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
- if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
- NMI_FLAG_FIRST, "smp_stop"))
- /* Note: we ignore failures here */
- /* Hope the REBOOT_IRQ is good enough */
- goto finish;
- /* sync above data before sending IRQ */
- wmb();
-
- pr_emerg("Shutting down cpus with NMI\n");
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if (num_online_cpus() > 1) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ /* Sync above data before sending IRQ */
+ wmb();
- apic->send_IPI_allbutself(NMI_VECTOR);
+ pr_emerg("Shutting down cpus with NMI\n");
+ apic_send_IPI_allbutself(NMI_VECTOR);
+ }
/*
- * Don't wait longer than a 10 ms if the caller
- * didn't ask us to wait.
+ * Don't wait longer than 10 ms if the caller didn't
+ * reqeust it. If wait is true, the machine hangs here if
+ * one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
-finish:
local_irq_save(flags);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fdbd47ceb84d..69881b2d446c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1023,8 +1023,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
int *cpu0_nmi_registered)
{
- volatile u32 *trampoline_status =
- (volatile u32 *) __va(real_mode_header->trampoline_status);
/* start_ip had better be page-aligned! */
unsigned long start_ip = real_mode_header->trampoline_start;
@@ -1116,9 +1114,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
}
}
- /* mark "stuck" area as not stuck */
- *trampoline_status = 0;
-
if (x86_platform.legacy.warm_reset) {
/*
* Cleanup possible dangling ends...
@@ -1596,7 +1591,12 @@ int native_cpu_disable(void)
if (ret)
return ret;
- clear_local_APIC();
+ /*
+ * Disable the local APIC. Otherwise IPI broadcasts will reach
+ * it. It still responds normally to INIT, NMI, SMI, and SIPI
+ * messages.
+ */
+ apic_soft_disable();
cpu_disable_common();
return 0;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index f7476ce23b6e..ca3c11a17b5a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -22,7 +22,6 @@
#include <asm/elf.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
-#include <asm/mpx.h>
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -137,10 +136,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_unmapped_area_info info;
unsigned long begin, end;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
if (flags & MAP_FIXED)
return addr;
@@ -180,10 +175,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 01f0e2263b86..298fc1edd9c9 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -90,11 +90,11 @@ __init int create_simplefb(const struct screen_info *si,
if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
size <<= 16;
length = mode->height * mode->stride;
- length = PAGE_ALIGN(length);
if (length > size) {
printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
return -EINVAL;
}
+ length = PAGE_ALIGN(length);
/* setup IORESOURCE_MEM as framebuffer memory */
memset(&res, 0, sizeof(res));
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a49fe1dcb47e..b89f6ac6a0c0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -57,7 +57,7 @@ void __init tboot_probe(void)
*/
if (!e820__mapped_any(boot_params.tboot_addr,
boot_params.tboot_addr, E820_TYPE_RESERVED)) {
- pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
+ pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
return;
}
@@ -65,13 +65,12 @@ void __init tboot_probe(void)
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warning("tboot at 0x%llx is invalid\n",
- boot_params.tboot_addr);
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
tboot = NULL;
return;
}
if (tboot->version < 5) {
- pr_warning("tboot version is invalid: %u\n", tboot->version);
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
tboot = NULL;
return;
}
@@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
- pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ pr_warn("unsupported sleep state 0x%x\n", sleep_state);
return -1;
}
@@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
if (!tboot_enabled())
return 0;
- pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
return -ENODEV;
}
@@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps)
}
if (timeout)
- pr_warning("tboot wait for APs timeout\n");
+ pr_warn("tboot wait for APs timeout\n");
return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
}
@@ -355,7 +354,7 @@ static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t c
void *kbuf;
int ret = -EFAULT;
- log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
if (!log_base)
return ret;
@@ -516,7 +515,7 @@ int tboot_force_iommu(void)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
- pr_warning("Forcing Intel-IOMMU to enabled\n");
+ pr_warn("Forcing Intel-IOMMU to enabled\n");
dmar_disabled = 0;
#ifdef CONFIG_SWIOTLB
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
deleted file mode 100644
index 6384be751eff..000000000000
--- a/arch/x86/kernel/tce_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * This file manages the translation entries for the IBM Calgary IOMMU.
- *
- * Derived from arch/powerpc/platforms/pseries/iommu.c
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/memblock.h>
-#include <asm/tce.h>
-#include <asm/calgary.h>
-#include <asm/proto.h>
-#include <asm/cacheflush.h>
-
-/* flush a tce at 'tceaddr' to main memory */
-static inline void flush_tce(void* tceaddr)
-{
- /* a single tce can't cross a cache line */
- if (boot_cpu_has(X86_FEATURE_CLFLUSH))
- clflush(tceaddr);
- else
- wbinvd();
-}
-
-void tce_build(struct iommu_table *tbl, unsigned long index,
- unsigned int npages, unsigned long uaddr, int direction)
-{
- u64* tp;
- u64 t;
- u64 rpn;
-
- t = (1 << TCE_READ_SHIFT);
- if (direction != DMA_TO_DEVICE)
- t |= (1 << TCE_WRITE_SHIFT);
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
- t &= ~TCE_RPN_MASK;
- t |= (rpn << TCE_RPN_SHIFT);
-
- *tp = cpu_to_be64(t);
- flush_tce(tp);
-
- uaddr += PAGE_SIZE;
- tp++;
- }
-}
-
-void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
-{
- u64* tp;
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- *tp = cpu_to_be64(0);
- flush_tce(tp);
- tp++;
- }
-}
-
-static inline unsigned int table_size_to_number_of_entries(unsigned char size)
-{
- /*
- * size is the order of the table, 0-7
- * smallest table is 8K entries, so shift result by 13 to
- * multiply by 8K
- */
- return (1 << size) << 13;
-}
-
-static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
-{
- unsigned int bitmapsz;
- unsigned long bmppages;
- int ret;
-
- tbl->it_busno = dev->bus->number;
-
- /* set the tce table size - measured in entries */
- tbl->it_size = table_size_to_number_of_entries(specified_table_size);
-
- /*
- * number of bytes needed for the bitmap size in number of
- * entries; we need one bit per entry
- */
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
- if (!bmppages) {
- printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
- ret = -ENOMEM;
- goto done;
- }
-
- tbl->it_map = (unsigned long*)bmppages;
-
- memset(tbl->it_map, 0, bitmapsz);
-
- tbl->it_hint = 0;
-
- spin_lock_init(&tbl->it_lock);
-
- return 0;
-
-done:
- return ret;
-}
-
-int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
-{
- struct iommu_table *tbl;
- int ret;
-
- if (pci_iommu(dev->bus)) {
- printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
- dev, pci_iommu(dev->bus));
- BUG();
- }
-
- tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
- if (!tbl) {
- printk(KERN_ERR "Calgary: error allocating iommu_table\n");
- ret = -ENOMEM;
- goto done;
- }
-
- ret = tce_table_setparms(dev, tbl);
- if (ret)
- goto free_tbl;
-
- tbl->bbar = bbar;
-
- set_pci_iommu(dev->bus, tbl);
-
- return 0;
-
-free_tbl:
- kfree(tbl);
-done:
- return ret;
-}
-
-void * __init alloc_tce_table(void)
-{
- unsigned int size;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- return memblock_alloc_low(size, size);
-}
-
-void __init free_tce_table(void *tbl)
-{
- unsigned int size;
-
- if (!tbl)
- return;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- memblock_free(__pa(tbl), size);
-}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 7ce29cee9f9e..d8673d8a779b 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -91,10 +91,18 @@ void __init hpet_time_init(void)
static __init void x86_late_time_init(void)
{
+ /*
+ * Before PIT/HPET init, select the interrupt mode. This is required
+ * to make the decision whether PIT should be initialized correct.
+ */
+ x86_init.irqs.intr_mode_select();
+
+ /* Setup the legacy timers */
x86_init.timers.timer_init();
+
/*
- * After PIT/HPET timers init, select and setup
- * the final interrupt mode for delivering IRQs.
+ * After PIT/HPET timers init, set up the final interrupt mode for
+ * delivering IRQs.
*/
x86_init.irqs.intr_mode_init();
tsc_init();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bb0f8447112..6ef00eb6fbb9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,11 +37,6 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -57,10 +52,10 @@
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
-#include <asm/trace/mpx.h>
-#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -311,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
}
#endif
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";
@@ -416,22 +426,14 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
-#ifdef CONFIG_DOUBLEFAULT
- df_debug(regs, error_code);
-#endif
- /*
- * This is always a kernel trap and never fixable (and thus must
- * never return).
- */
- for (;;)
- die(str, regs, error_code);
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ die("double fault", regs, error_code);
+ panic("Machine halted.");
}
#endif
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
- const struct mpx_bndcsr *bndcsr;
-
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
@@ -441,84 +443,60 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
if (!user_mode(regs))
die("bounds", regs, error_code);
- if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
- /* The exception is not from Intel MPX */
- goto exit_trap;
- }
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+}
- /*
- * We need to look at BNDSTATUS to resolve this exception.
- * A NULL here might mean that it is in its 'init state',
- * which is all zeros which indicates MPX was not
- * responsible for the exception.
- */
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- goto exit_trap;
+enum kernel_gp_hint {
+ GP_NO_HINT,
+ GP_NON_CANONICAL,
+ GP_CANONICAL
+};
- trace_bounds_exception_mpx(bndcsr);
- /*
- * The error code field of the BNDSTATUS register communicates status
- * information of a bound range exception #BR or operation involving
- * bound directory.
- */
- switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
- case 2: /* Bound directory has invalid entry. */
- if (mpx_handle_bd_fault())
- goto exit_trap;
- break; /* Success, it was handled */
- case 1: /* Bound violation. */
- {
- struct task_struct *tsk = current;
- struct mpx_fault_info mpx;
-
- if (mpx_fault_info(&mpx, regs)) {
- /*
- * We failed to decode the MPX instruction. Act as if
- * the exception was not caused by MPX.
- */
- goto exit_trap;
- }
- /*
- * Success, we decoded the instruction and retrieved
- * an 'mpx' containing the address being accessed
- * which caused the exception. This information
- * allows and application to possibly handle the
- * #BR exception itself.
- */
- if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs,
- error_code))
- break;
+/*
+ * When an uncaught #GP occurs, try to determine the memory address accessed by
+ * the instruction and return that address to the caller. Also, try to figure
+ * out whether any part of the access to that address was non-canonical.
+ */
+static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
+ unsigned long *addr)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
- show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code);
+ if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return GP_NO_HINT;
- force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper);
- break;
- }
- case 0: /* No exception caused by Intel MPX operations. */
- goto exit_trap;
- default:
- die("bounds", regs, error_code);
- }
+ kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
- return;
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (*addr == -1UL)
+ return GP_NO_HINT;
-exit_trap:
+#ifdef CONFIG_X86_64
/*
- * This path out is for all the cases where we could not
- * handle the exception in some way (like allocating a
- * table or telling userspace about it. We will also end
- * up here if the kernel has MPX turned off at compile
- * time..
+ * Check that:
+ * - the operand is not in the kernel half
+ * - the last byte of the operand is not in the user canonical half
*/
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+ if (*addr < ~__VIRTUAL_MASK &&
+ *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+ return GP_NON_CANONICAL;
+#endif
+
+ return GP_CANONICAL;
}
-dotraplinkage void
-do_general_protection(struct pt_regs *regs, long error_code)
+#define GPFSTR "general protection fault"
+
+dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
+ enum kernel_gp_hint hint = GP_NO_HINT;
struct task_struct *tsk;
+ unsigned long gp_addr;
+ int ret;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -535,48 +513,61 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
tsk = current;
- if (!user_mode(regs)) {
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ if (user_mode(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- /*
- * To be potentially processing a kprobe fault and to
- * trust the result from kprobe_running(), we have to
- * be non-preemptible.
- */
- if (!preemptible() && kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- return;
+ show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
+ force_sig(SIGSEGV);
- if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
return;
}
+ if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
+ return;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() &&
+ kprobe_running() &&
+ kprobe_fault_handler(regs, X86_TRAP_GP))
+ return;
+
+ ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
+ if (ret == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ hint = get_kernel_gp_address(regs, &gp_addr);
+
+ if (hint != GP_NO_HINT)
+ snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
+ (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
+ : "maybe for address",
+ gp_addr);
+
+ /*
+ * KASAN is interested only in the non-canonical case, clear it
+ * otherwise.
+ */
+ if (hint != GP_NON_CANONICAL)
+ gp_addr = 0;
+
+ die_addr(desc, regs, error_code, gp_addr);
- force_sig(SIGSEGV);
}
NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57d87f79558f..7e322e2daaf5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -638,7 +638,7 @@ unsigned long native_calibrate_tsc(void)
* clock.
*/
if (crystal_khz == 0 &&
- boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X)
+ boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
crystal_khz = 25000;
/*
@@ -1505,6 +1505,9 @@ void __init tsc_init(void)
return;
}
+ if (tsc_clocksource_reliable || no_tsc_watchdog)
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
}
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 067858fe4db8..e0cbe4f2af49 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -58,6 +58,10 @@ static const struct freq_desc freq_desc_ann = {
1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 }
};
+static const struct freq_desc freq_desc_lgm = {
+ 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 }
+};
+
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw),
INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv),
@@ -65,6 +69,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng),
INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht),
INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann),
+ INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index ec534f978867..32a818764e03 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -233,7 +233,6 @@ static cycles_t check_tsc_warp(unsigned int timeout)
* The measurement runs for 'timeout' msecs:
*/
end = start + (cycles_t) tsc_khz * timeout;
- now = start;
for (i = 0; ; i++) {
/*
@@ -364,12 +363,12 @@ retry:
/* Force it to 0 if random warps brought us here */
atomic_set(&test_runs, 0);
- pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n",
smp_processor_id(), cpu);
- pr_warning("Measured %Ld cycles TSC warp between CPUs, "
- "turning off TSC clock.\n", max_warp);
+ pr_warn("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
if (random_warps)
- pr_warning("TSC warped randomly between CPUs\n");
+ pr_warn("TSC warped randomly between CPUs\n");
mark_tsc_unstable("check_tsc_sync_source failed");
}
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 5b345add550f..4d732a444711 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -1,6 +1,6 @@
/*
- * umip.c Emulation for instruction protected by the Intel User-Mode
- * Instruction Prevention feature
+ * umip.c Emulation for instruction protected by the User-Mode Instruction
+ * Prevention feature
*
* Copyright (c) 2017, Intel Corporation.
* Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
@@ -18,10 +18,10 @@
/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
*
- * The feature User-Mode Instruction Prevention present in recent Intel
- * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
- * from being executed with CPL > 0. Otherwise, a general protection fault is
- * issued.
+ * User-Mode Instruction Prevention is a security feature present in recent
+ * x86 processors that, when enabled, prevents a group of instructions (SGDT,
+ * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general
+ * protection fault if the instruction is executed with CPL > 0.
*
* Rather than relaying to the user space the general protection fault caused by
* the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
@@ -36,8 +36,8 @@
* DOSEMU2) rely on this subset of instructions to function.
*
* The instructions protected by UMIP can be split in two groups. Those which
- * return a kernel memory address (sgdt and sidt) and those which return a
- * value (sldt, str and smsw).
+ * return a kernel memory address (SGDT and SIDT) and those which return a
+ * value (SLDT, STR and SMSW).
*
* For the instructions that return a kernel memory address, applications
* such as WineHQ rely on the result being located in the kernel memory space,
@@ -45,15 +45,13 @@
* value that, lies close to the top of the kernel memory. The limit for the GDT
* and the IDT are set to zero.
*
- * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * Given that SLDT and STR are not commonly used in programs that run on WineHQ
* or DOSEMU2, they are not emulated.
*
* The instruction smsw is emulated to return the value that the register CR0
* has at boot time as set in the head_32.
*
- * Also, emulation is provided only for 32-bit processes; 64-bit processes
- * that attempt to use the instructions that UMIP protects will receive the
- * SIGSEGV signal issued as a consequence of the general protection fault.
+ * Emulation is provided for both 32-bit and 64-bit processes.
*
* Care is taken to appropriately emulate the results when segmentation is
* used. That is, rather than relying on USER_DS and USER_CS, the function
@@ -63,17 +61,18 @@
* application uses a local descriptor table.
*/
-#define UMIP_DUMMY_GDT_BASE 0xfffe0000
-#define UMIP_DUMMY_IDT_BASE 0xffff0000
+#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL
+#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL
/*
* The SGDT and SIDT instructions store the contents of the global descriptor
* table and interrupt table registers, respectively. The destination is a
* memory operand of X+2 bytes. X bytes are used to store the base address of
- * the table and 2 bytes are used to store the limit. In 32-bit processes, the
- * only processes for which emulation is provided, X has a value of 4.
+ * the table and 2 bytes are used to store the limit. In 32-bit processes X
+ * has a value of 4, in 64-bit processes X has a value of 8.
*/
-#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8
+#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4
#define UMIP_GDT_IDT_LIMIT_SIZE 2
#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
@@ -92,7 +91,7 @@ const char * const umip_insns[5] = {
#define umip_pr_err(regs, fmt, ...) \
umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
-#define umip_pr_warning(regs, fmt, ...) \
+#define umip_pr_warn(regs, fmt, ...) \
umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
/**
@@ -189,6 +188,7 @@ static int identify_insn(struct insn *insn)
* @umip_inst: A constant indicating the instruction to emulate
* @data: Buffer into which the dummy result is stored
* @data_size: Size of the emulated result
+ * @x86_64: true if process is 64-bit, false otherwise
*
* Emulate an instruction protected by UMIP and provide a dummy result. The
* result of the emulation is saved in @data. The size of the results depends
@@ -202,11 +202,8 @@ static int identify_insn(struct insn *insn)
* 0 on success, -EINVAL on error while emulating.
*/
static int emulate_umip_insn(struct insn *insn, int umip_inst,
- unsigned char *data, int *data_size)
+ unsigned char *data, int *data_size, bool x86_64)
{
- unsigned long dummy_base_addr, dummy_value;
- unsigned short dummy_limit = 0;
-
if (!data || !data_size || !insn)
return -EINVAL;
/*
@@ -219,6 +216,9 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
* is always returned irrespective of the operand size.
*/
if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ u64 dummy_base_addr;
+ u16 dummy_limit = 0;
+
/* SGDT and SIDT do not use registers operands. */
if (X86_MODRM_MOD(insn->modrm.value) == 3)
return -EINVAL;
@@ -228,13 +228,24 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
else
dummy_base_addr = UMIP_DUMMY_IDT_BASE;
- *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+ /*
+ * 64-bit processes use the entire dummy base address.
+ * 32-bit processes use the lower 32 bits of the base address.
+ * dummy_base_addr is always 64 bits, but we memcpy the correct
+ * number of bytes from it to the destination.
+ */
+ if (x86_64)
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT;
+ else
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT;
+
+ memcpy(data + 2, &dummy_base_addr, *data_size);
- memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+ *data_size += UMIP_GDT_IDT_LIMIT_SIZE;
memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
} else if (umip_inst == UMIP_INST_SMSW) {
- dummy_value = CR0_STATE;
+ unsigned long dummy_value = CR0_STATE;
/*
* Even though the CR0 register has 4 bytes, the number
@@ -290,11 +301,10 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
* fixup_umip_exception() - Fixup a general protection fault caused by UMIP
* @regs: Registers as saved when entering the #GP handler
*
- * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
- * fault if executed with CPL > 0 (i.e., from user space). If the offending
- * user-space process is not in long mode, this function fixes the exception
- * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
- * fixed up. Also long mode user-space processes are not fixed up.
+ * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). This function fixes
+ * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR
+ * and SLDT are not fixed up.
*
* If operands are memory addresses, results are copied to user-space memory as
* indicated by the instruction pointed by eIP using the registers indicated in
@@ -370,16 +380,17 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (umip_inst < 0)
return false;
- umip_pr_warning(regs, "%s instruction cannot be used by applications.\n",
+ umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- /* Do not emulate SLDT, STR or user long mode processes. */
- if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs))
+ /* Do not emulate (spoof) SLDT or STR. */
+ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT)
return false;
- umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n");
+ umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
- if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
+ user_64bit_mode(regs)))
return false;
/*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 332ae6530fa8..e9cc182aa97e 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -187,6 +187,8 @@ static struct orc_entry *orc_find(unsigned long ip)
return orc_ftrace_find(ip);
}
+#ifdef CONFIG_MODULES
+
static void orc_sort_swap(void *_a, void *_b, int size)
{
struct orc_entry *orc_a, *orc_b;
@@ -229,7 +231,6 @@ static int orc_sort_cmp(const void *_a, const void *_b)
return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
}
-#ifdef CONFIG_MODULES
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
void *_orc, size_t orc_size)
{
@@ -273,9 +274,11 @@ void __init unwind_init(void)
return;
}
- /* Sort the .orc_unwind and .orc_unwind_ip tables: */
- sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
- orc_sort_swap);
+ /*
+ * Note, the orc_unwind and orc_unwind_ip tables were already
+ * sorted at build time via the 'sorttable' tool.
+ * It's ready for binary search straight away, no need to sort it.
+ */
/* Initialize the fast lookup table: */
lookup_num_blocks = orc_lookup_end - orc_lookup;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index d8359ebeea70..15e5aad8ac2c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -508,9 +508,12 @@ struct uprobe_xol_ops {
void (*abort)(struct arch_uprobe *, struct pt_regs *);
};
-static inline int sizeof_long(void)
+static inline int sizeof_long(struct pt_regs *regs)
{
- return in_ia32_syscall() ? 4 : 8;
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
}
static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
@@ -521,9 +524,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{
- unsigned long new_sp = regs->sp - sizeof_long();
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
- if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
+ if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs)))
return -EFAULT;
regs->sp = new_sp;
@@ -556,7 +559,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
long correction = utask->vaddr - utask->xol_vaddr;
regs->ip += correction;
} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
- regs->sp += sizeof_long(); /* Pop incorrect return address */
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
return -ERESTART;
}
@@ -675,7 +678,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
* "call" insn was executed out-of-line. Just restore ->sp and restart.
* We could also restore ->ip and try to call branch_emulate_op() again.
*/
- regs->sp += sizeof_long();
+ regs->sp += sizeof_long(regs);
return -ERESTART;
}
@@ -839,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
/**
* arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
* @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
* @addr: virtual address at which to install the probepoint
* Return 0 on success or a -ve number on error.
*/
@@ -1056,7 +1059,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize = sizeof_long(), nleft;
+ int rasize = sizeof_long(regs), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a024c4f7ba56..641f0fe1e5b4 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -31,7 +31,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-ENTRY(verify_cpu)
+SYM_FUNC_START_LOCAL(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -137,4 +137,4 @@ ENTRY(verify_cpu)
popf # Restore caller passed flags
xorl %eax, %eax
ret
-ENDPROC(verify_cpu)
+SYM_FUNC_END(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index a76c12b38e92..91d55454e702 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -381,7 +381,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
mark_screen_rdonly(tsk->mm);
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
- force_iret();
return regs->ax;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e2feacf921a0..e3296aa028fe 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -21,6 +21,9 @@
#define LOAD_OFFSET __START_KERNEL_map
#endif
+#define EMITS_PT_NOTE
+#define RO_EXCEPTION_TABLE_ALIGN 16
+
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -141,17 +144,12 @@ SECTIONS
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
+ } :text =0xcccc
- /* End of text section */
- _etext = .;
- } :text = 0x9090
-
- NOTES :text :note
-
- EXCEPTION_TABLE(16) :text = 0x9090
-
- /* .text should occupy whole number of pages */
+ /* End of text section, which should occupy whole number of pages */
+ _etext = .;
. = ALIGN(PAGE_SIZE);
+
X86_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X86_ALIGN_RODATA_END
@@ -195,12 +193,10 @@ SECTIONS
__vvar_beginning_hack = .;
/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) \
+#define EMIT_VVAR(name, offset) \
. = __vvar_beginning_hack + offset; \
*(.vvar_ ## name)
-#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
/*
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 1bef687faf22..85f1a90c55cd 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -20,7 +20,7 @@
#include <asm/irq.h>
#include <asm/io_apic.h>
#include <asm/hpet.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
#include <asm/mach_traps.h>
@@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
+static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+static __init void get_rtc_noop(struct timespec64 *now) { }
+
+static __initconst const struct of_device_id of_cmos_match[] = {
+ { .compatible = "motorola,mc146818" },
+ {}
+};
+
+/*
+ * Allow devicetree configured systems to disable the RTC by setting the
+ * corresponding DT node's status property to disabled. Code is optimized
+ * out for CONFIG_OF=n builds.
+ */
+static __init void x86_wallclock_init(void)
+{
+ struct device_node *node = of_find_matching_node(NULL, of_cmos_match);
+
+ if (node && !of_device_is_available(node)) {
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ }
+}
/*
* The platform setup functions are preset with the default functions
@@ -58,6 +80,7 @@ struct x86_init_ops x86_init __initdata = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
.trap_init = x86_init_noop,
+ .intr_mode_select = apic_intr_mode_select,
.intr_mode_init = apic_intr_mode_init
},
@@ -73,7 +96,7 @@ struct x86_init_ops x86_init __initdata = {
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
.timer_init = hpet_time_init,
- .wallclock_init = x86_init_noop,
+ .wallclock_init = x86_wallclock_init,
},
.iommu = {
@@ -95,6 +118,7 @@ struct x86_init_ops x86_init __initdata = {
},
.acpi = {
+ .set_root_pointer = x86_default_set_root_pointer,
.get_root_pointer = x86_default_get_root_pointer,
.reduced_hw_early_init = acpi_generic_reduced_hw_init,
},
OpenPOWER on IntegriCloud