diff options
Diffstat (limited to 'arch/x86/kernel')
139 files changed, 4406 insertions, 4925 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..9b294c13809a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -94,13 +94,16 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault.o +ifeq ($(CONFIG_X86_32),y) +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +endif obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o @@ -134,7 +137,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_X86_INTEL_UMIP) += umip.o +obj-$(CONFIG_X86_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o @@ -146,7 +149,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o - obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 17b33ef604f3..04205ce127a1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1760,6 +1760,11 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) e820__update_table_print(); } +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ca13851f0570..26b7256f590f 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -27,6 +27,17 @@ static char temp_stack[4096]; #endif /** + * acpi_get_wakeup_address - provide physical address for S3 wakeup + * + * Returns the physical address where the kernel should be resumed after the + * system awakes from S3, e.g. for programming into the firmware waking vector. + */ +unsigned long acpi_get_wakeup_address(void) +{ + return ((unsigned long)(real_mode_header->wakeup_start)); +} + +/** * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index fbb60ca4255c..d06c2079b6c1 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -3,7 +3,7 @@ * Variables and functions used by the code in sleep.c */ -#include <asm/realmode.h> +#include <linux/linkage.h> extern unsigned long saved_video_mode; extern long saved_magic; diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index e95e95960156..daf88f8143c5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -9,8 +9,7 @@ .code32 ALIGN -ENTRY(wakeup_pmode_return) -wakeup_pmode_return: +SYM_CODE_START(wakeup_pmode_return) movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %fs @@ -39,6 +38,7 @@ wakeup_pmode_return: # jump to place where we left off movl saved_eip, %eax jmp *%eax +SYM_CODE_END(wakeup_pmode_return) bogus_magic: jmp bogus_magic @@ -72,7 +72,7 @@ restore_registers: popfl ret -ENTRY(do_suspend_lowlevel) +SYM_CODE_START(do_suspend_lowlevel) call save_processor_state call save_registers pushl $3 @@ -87,10 +87,11 @@ ret_point: call restore_registers call restore_processor_state ret +SYM_CODE_END(do_suspend_lowlevel) .data ALIGN -ENTRY(saved_magic) .long 0 +SYM_DATA(saved_magic, .long 0) saved_eip: .long 0 # saved registers diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index b0715c3ac18d..c8daa92f38dc 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,12 +14,17 @@ /* * Hooray, we are in Long 64-bit mode (but still running in low memory) */ -ENTRY(wakeup_long64) +SYM_FUNC_START(wakeup_long64) movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax - jne bogus_64_magic + je 2f + /* stop here on a saved_magic mismatch */ + movq $0xbad6d61676963, %rcx +1: + jmp 1b +2: movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %ds @@ -35,12 +40,9 @@ ENTRY(wakeup_long64) movq saved_rip, %rax jmp *%rax -ENDPROC(wakeup_long64) +SYM_FUNC_END(wakeup_long64) -bogus_64_magic: - jmp bogus_64_magic - -ENTRY(do_suspend_lowlevel) +SYM_FUNC_START(do_suspend_lowlevel) FRAME_BEGIN subq $8, %rsp xorl %eax, %eax @@ -123,7 +125,7 @@ ENTRY(do_suspend_lowlevel) addq $8, %rsp FRAME_END jmp restore_processor_state -ENDPROC(do_suspend_lowlevel) +SYM_FUNC_END(do_suspend_lowlevel) .data saved_rbp: .quad 0 @@ -134,4 +136,4 @@ saved_rbx: .quad 0 saved_rip: .quad 0 saved_rsp: .quad 0 -ENTRY(saved_magic) .quad 0 +SYM_DATA(saved_magic, .quad 0) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ccd32013c47a..15ac0d5f4b40 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -23,6 +23,7 @@ #include <asm/nmi.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> @@ -713,7 +714,7 @@ void __init alternative_instructions(void) * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. + * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code @@ -753,8 +754,8 @@ void __init alternative_instructions(void) * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these - * instructions. And on the local CPU you need to be protected again NMI or MCE - * handlers seeing an inconsistent instruction while you patch. + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) @@ -936,74 +937,139 @@ static void do_sync_core(void *info) sync_core(); } -static struct bp_patching_desc { +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +struct text_poke_loc { + s32 rel_addr; /* addr := _stext + rel_addr */ + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; +}; + +struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; -} bp_patching; + atomic_t refs; +}; + +static struct bp_patching_desc *bp_desc; -static int patch_cmp(const void *key, const void *elt) +static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +{ + struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + + if (!desc || !atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static inline void put_desc(struct bp_patching_desc *desc) +{ + smp_mb__before_atomic(); + atomic_dec(&desc->refs); +} + +static inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static int notrace patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; - if (key < tp->addr) + if (key < text_poke_addr(tp)) return -1; - if (key > tp->addr) + if (key > text_poke_addr(tp)) return 1; return 0; } NOKPROBE_SYMBOL(patch_cmp); -int poke_int3_handler(struct pt_regs *regs) +int notrace poke_int3_handler(struct pt_regs *regs) { + struct bp_patching_desc *desc; struct text_poke_loc *tp; - unsigned char int3 = 0xcc; + int len, ret = 0; void *ip; + if (user_mode(regs)) + return 0; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching.nr_entries. - * - * nr_entries != 0 INT3 - * WMB RMB - * write INT3 if (nr_entries) + * bp_desc: * - * Idem for other elements in bp_patching. + * bp_desc = desc INT3 + * WMB RMB + * write INT3 if (desc) */ smp_rmb(); - if (likely(!bp_patching.nr_entries)) - return 0; - - if (user_mode(regs)) + desc = try_get_desc(&bp_desc); + if (!desc) return 0; /* - * Discount the sizeof(int3). See text_poke_bp_batch(). + * Discount the INT3. See text_poke_bp_batch(). */ - ip = (void *) regs->ip - sizeof(int3); + ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(bp_patching.nr_entries > 1)) { - tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + if (unlikely(desc->nr_entries > 1)) { + tp = bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) - return 0; + goto out_put; } else { - tp = bp_patching.vec; - if (tp->addr != ip) - return 0; + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; } - /* set up the specified breakpoint detour */ - regs->ip = (unsigned long) tp->detour; + len = text_opcode_size(tp->opcode); + ip += len; - return 1; + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + goto out_put; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->rel32); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->rel32); + break; + + default: + BUG(); + } + + ret = 1; + +out_put: + put_desc(desc); + return ret; } NOKPROBE_SYMBOL(poke_int3_handler); +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch @@ -1014,7 +1080,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * synchronization using int3 breakpoint. * * The way it is done: - * - For each entry in the vector: + * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: @@ -1025,16 +1091,20 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - int patched_all_but_first = 0; - unsigned char int3 = 0xcc; + struct bp_patching_desc desc = { + .vec = tp, + .nr_entries = nr_entries, + .refs = ATOMIC_INIT(1), + }; + unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; + int do_sync; lockdep_assert_held(&text_mutex); - bp_patching.vec = tp; - bp_patching.nr_entries = nr_entries; + smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ /* * Corresponding read barrier in int3 notifier for making sure the @@ -1046,45 +1116,153 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, &int3, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ - for (i = 0; i < nr_entries; i++) { - if (tp[i].len - sizeof(int3) > 0) { - text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].opcode + sizeof(int3), - tp[i].len - sizeof(int3)); - patched_all_but_first++; + for (do_sync = 0, i = 0; i < nr_entries; i++) { + int len = text_opcode_size(tp[i].opcode); + + if (len - INT3_INSN_SIZE > 0) { + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + (const char *)tp[i].text + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + do_sync++; } } - if (patched_all_but_first) { + if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); } /* * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); + do_sync++; + } + + if (do_sync) + text_poke_sync(); - on_each_cpu(do_sync_core, NULL, 1); /* - * sync_core() implies an smp_mb() and orders this store against - * the writing of the new instruction. + * Remove and synchronize_rcu(), except we have a very primitive + * refcount based completion. */ - bp_patching.vec = NULL; - bp_patching.nr_entries = 0; + WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ + if (!atomic_dec_and_test(&desc.refs)) + atomic_cond_read_acquire(&desc.refs, !VAL); +} + +void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + + memcpy((void *)tp->text, opcode, len); + if (!emulate) + emulate = opcode; + + kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); + insn_get_length(&insn); + + BUG_ON(!insn_complete(&insn)); + BUG_ON(len != insn.length); + + tp->rel_addr = addr - (void *)_stext; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->rel32 = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->rel32 = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->rel32 = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); } /** @@ -1098,20 +1276,15 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp = { - .detour = handler, - .addr = addr, - .len = len, - }; + struct text_poke_loc tp; - if (len > POKE_MAX_OPCODE_SIZE) { - WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); return; } - memcpy((void *)tp.opcode, opcode, len); - + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a585ea6f686a..4e5f50236048 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || !dma_capable(dev, addr, size); + return force_iommu || !dma_capable(dev, addr, size, true); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !dma_capable(dev, addr, size); + return !dma_capable(dev, addr, size, true); } /* Map a single continuous physical area into the IOMMU. @@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); + iommu_size >> 20); } return iommu_size; @@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); return -1; } @@ -677,7 +675,10 @@ static const struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc = gart_alloc_coherent, .free = gart_free_coherent, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, }; static void gart_iommu_shutdown(void) @@ -730,8 +731,8 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); } return 0; } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index d63e63b7d1d9..69aed0ebbdfc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,8 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 /* Protect the PCI config register pairs used for SMN and DF indirect access. */ static DEFINE_MUTEX(smn_mutex); @@ -50,6 +52,8 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -63,6 +67,8 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 5da106f84e84..fe698f96617c 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -95,7 +95,7 @@ static inline void apbt_set_mapping(void) printk(KERN_WARNING "No timer base from SFI, use default\n"); apbt_address = APBT_DEFAULT_BASE; } - apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE); if (!apbt_virt_address) { pr_debug("Failed mapping APBT phy address at %lu\n",\ (unsigned long)apbt_address); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f5291362da1a..5f973fed3c9f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -65,10 +65,10 @@ unsigned int num_processors; unsigned disabled_cpus; /* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; +unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); -u8 boot_cpu_apic_version; +u8 boot_cpu_apic_version __ro_after_init; /* * The highest APIC ID seen during enumeration. @@ -85,13 +85,13 @@ physid_mask_t phys_cpu_present_map; * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to * avoid undefined behaviour caused by sending INIT from AP to BSP. */ -static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; +static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; /* * This variable controls which CPUs receive external NMIs. By default, * external NMIs are delivered only to the BSP. */ -static int apic_extnmi = APIC_EXTNMI_BSP; +static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; /* * Map cpu index to physical APIC ID @@ -114,7 +114,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); /* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; +static int enabled_via_apicbase __ro_after_init; /* * Handle interrupt mode configuration register (IMCR). @@ -172,23 +172,23 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -unsigned long mp_lapic_addr; -int disable_apic; +unsigned long mp_lapic_addr __ro_after_init; +int disable_apic __ro_after_init; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; +int local_apic_timer_c2_ok __ro_after_init; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +int apic_verbosity __ro_after_init; -int pic_mode; +int pic_mode __ro_after_init; /* Have we found an MP table */ -int smp_found_config; +int smp_found_config __ro_after_init; static struct resource lapic_resource = { .name = "Local APIC", @@ -199,7 +199,7 @@ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); -static unsigned long apic_phys; +static unsigned long apic_phys __ro_after_init; /* * Get the LAPIC version @@ -590,21 +590,21 @@ static u32 skx_deadline_rev(void) static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), {}, }; @@ -722,7 +722,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* - * Temporary interrupt handler. + * Temporary interrupt handler and polled calibration function. */ static void __init lapic_cal_handler(struct clock_event_device *dev) { @@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warning("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n",(long)res); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -830,8 +830,21 @@ bool __init apic_needs_pit(void) if (!tsc_khz || !cpu_khz) return true; - /* Is there an APIC at all? */ - if (!boot_cpu_has(X86_FEATURE_APIC)) + /* Is there an APIC at all or is it disabled? */ + if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + return true; + + /* + * If interrupt delivery mode is legacy PIC or virtual wire without + * configuration, the local APIC timer wont be set up. Make sure + * that the PIT is initialized. + */ + if (apic_intr_mode == APIC_PIC || + apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG) + return true; + + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) return true; /* Deadline timer is based on TSC so no further PIT action required */ @@ -851,7 +864,8 @@ bool __init apic_needs_pit(void) static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); - void (*real_handler)(struct clock_event_device *dev); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; int pm_referenced = 0; @@ -878,28 +892,64 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" "calibrating APIC timer ...\n"); + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ local_irq_disable(); - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - /* * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ __setup_APIC_LVTT(0xffffffff, 0, 0); - /* Let the interrupts run */ + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ local_irq_enable(); - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); - local_irq_disable(); + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } + + local_irq_disable(); /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; @@ -936,17 +986,18 @@ static int __init calibrate_APIC_clock(void) */ if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); - pr_warning("APIC frequency too slow, disabling apic timer\n"); + pr_warn("APIC frequency too slow, disabling apic timer\n"); return -1; } levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* - * PM timer calibration failed or not turned on - * so lets try APIC timer based calibration + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. */ - if (!pm_referenced) { + if (!pm_referenced && global_clock_event) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); /* @@ -979,7 +1030,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - pr_warning("APIC timer disabled due to verification failure\n"); + pr_warn("APIC timer disabled due to verification failure\n"); return -1; } @@ -1053,8 +1104,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", - smp_processor_id()); + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1182,25 +1233,38 @@ void clear_local_APIC(void) } /** - * disable_local_APIC - clear and disable the local APIC + * apic_soft_disable - Clears and software disables the local APIC on hotplug + * + * Contrary to disable_local_APIC() this does not touch the enable bit in + * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC + * bus would require a hardware reset as the APIC would lose track of bus + * arbitration. On systems with FSB delivery APICBASE could be disabled, + * but it has to be guaranteed that no interrupt is sent to the APIC while + * in that state and it's not clear from the SDM whether it still responds + * to INIT/SIPI messages. Stay on the safe side and use software disable. */ -void disable_local_APIC(void) +void apic_soft_disable(void) { - unsigned int value; - - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) - return; + u32 value; clear_local_APIC(); - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ + /* Soft disable APIC (implies clearing of registers for 82489DX!). */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + apic_soft_disable(); #ifdef CONFIG_X86_32 /* @@ -1265,9 +1329,9 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } -enum apic_intr_mode_id apic_intr_mode; +enum apic_intr_mode_id apic_intr_mode __ro_after_init; -static int __init apic_intr_mode_select(void) +static int __init __apic_intr_mode_select(void) { /* Check kernel option */ if (disable_apic) { @@ -1329,6 +1393,12 @@ static int __init apic_intr_mode_select(void) return APIC_SYMMETRIC_IO; } +/* Select the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_select(void) +{ + apic_intr_mode = __apic_intr_mode_select(); +} + /* * An initial setup of the virtual wire mode. */ @@ -1385,8 +1455,6 @@ void __init apic_intr_mode_init(void) { bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); - apic_intr_mode = apic_intr_mode_select(); - switch (apic_intr_mode) { case APIC_PIC: pr_info("APIC: Keep in PIC mode(8259)\n"); @@ -1453,54 +1521,72 @@ static void lapic_setup_esr(void) oldvalue, value); } -static void apic_pending_intr_clear(void) +#define APIC_IR_REGS APIC_ISR_NR +#define APIC_IR_BITS (APIC_IR_REGS * 32) +#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG) + +union apic_ir { + unsigned long map[APIC_IR_MAPSIZE]; + u32 regs[APIC_IR_REGS]; +}; + +static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) { - long long max_loops = cpu_khz ? cpu_khz : 1000000; - unsigned long long tsc = 0, ntsc; - unsigned int queued; - unsigned long value; - int i, j, acked = 0; + int i, bit; + + /* Read the IRRs */ + for (i = 0; i < APIC_IR_REGS; i++) + irr->regs[i] = apic_read(APIC_IRR + i * 0x10); + + /* Read the ISRs */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); - if (boot_cpu_has(X86_FEATURE_TSC)) - tsc = rdtsc(); /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + * If the ISR map is not empty. ACK the APIC and run another round + * to verify whether a pending IRR has been unblocked and turned + * into a ISR. */ - do { - queued = 0; - for (i = APIC_ISR_NR - 1; i >= 0; i--) - queued |= apic_read(APIC_IRR + i*0x10); - - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for_each_set_bit(j, &value, 32) { - ack_APIC_irq(); - acked++; - } - } - if (acked > 256) { - pr_err("LAPIC pending interrupts after %d EOI\n", acked); - break; - } - if (queued) { - if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { - ntsc = rdtsc(); - max_loops = (long long)cpu_khz << 10; - max_loops -= ntsc - tsc; - } else { - max_loops--; - } - } - } while (queued && max_loops > 0); - WARN_ON(max_loops <= 0); + if (!bitmap_empty(isr->map, APIC_IR_BITS)) { + /* + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an ACK + * per set bit. + */ + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + ack_APIC_irq(); + return true; + } + + return !bitmap_empty(irr->map, APIC_IR_BITS); +} + +/* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now the CPU has serviced that pending interrupt and it + * might not have done the ack_APIC_irq() because it thought, interrupt + * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear + * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * a vector might get locked. It was noticed for timer irq (vector + * 0x31). Issue an extra EOI to clear ISR. + * + * If there are pending IRR bits they turn into ISR bits after a higher + * priority ISR bit has been acked. + */ +static void apic_pending_intr_clear(void) +{ + union apic_ir irr, isr; + unsigned int i; + + /* 512 loops are way oversized and give the APIC a chance to obey. */ + for (i = 0; i < 512; i++) { + if (!apic_check_and_ack(&irr, &isr)) + return; + } + /* Dump the IRR/ISR content if that failed */ + pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); } /** @@ -1513,16 +1599,20 @@ static void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value; -#ifdef CONFIG_X86_32 - int logical_apicid, ldr_apicid; -#endif - if (disable_apic) { disable_ioapic_support(); return; } + /* + * If this comes from kexec/kcrash the APIC might be enabled in + * SPIV. Soft disable it before doing further initialization. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + #ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (lapic_is_integrated() && apic->disable_esr) { @@ -1532,8 +1622,6 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_events_lapic_init(); - /* * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. @@ -1548,26 +1636,35 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* - * Set Task Priority to 'accept all'. We never change this - * later on. + * Set Task Priority to 'accept all except vectors 0-31'. An APIC + * vector in the 16-31 range could be delivered if TPR == 0, but we + * would think it's an exception and terrible things will happen. We + * never change this later on. */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; + value |= 0x10; apic_write(APIC_TASKPRI, value); + /* Clear eventually stale ISR/IRR bits */ apic_pending_intr_clear(); /* @@ -1614,6 +1711,8 @@ static void setup_local_APIC(void) value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); + perf_events_lapic_init(); + /* * Set up LVT0, LVT1: * @@ -1725,11 +1824,11 @@ static int __init setup_nox2apic(char *str) int apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); return 0; } - pr_warning("x2apic already enabled.\n"); + pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } setup_clear_cpu_cap(X86_FEATURE_X2APIC); @@ -1897,7 +1996,7 @@ static int __init apic_verify(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); + pr_warn("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -2251,7 +2350,7 @@ static int cpuid_to_apicid[] = { #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @id: APIC ID to check + * @apicid: APIC ID to check */ bool apic_id_is_primary_thread(unsigned int apicid) { @@ -2324,9 +2423,8 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", - thiscpu, apicid); + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2340,8 +2438,7 @@ int generic_processor_info(int apicid, int version) apicid != boot_cpu_physical_apicid) { int thiscpu = max + disabled_cpus - 1; - pr_warning( - "APIC: NR_CPUS/possible_cpus limit of %i almost" + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2352,9 +2449,8 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2384,13 +2480,13 @@ int generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); version = 0x10; } if (version != boot_cpu_apic_version) { - pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", boot_cpu_apic_version, cpu, version); } @@ -2543,6 +2639,13 @@ static int lapic_suspend(void) #endif local_irq_save(flags); + + /* + * Mask IOAPIC before disabling the local APIC to prevent stale IRR + * entries on some implementations. + */ + mask_ioapic_entries(); + disable_local_APIC(); irq_remapping_disable(); @@ -2759,7 +2862,7 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_VERBOSE; #ifdef CONFIG_X86_64 else { - pr_warning("APIC Verbosity level %s not recognised" + pr_warn("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bbdca603f94a..7862b152a052 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -8,21 +8,14 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/acpi.h> -#include <linux/errno.h> -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/hardirq.h> #include <linux/export.h> +#include <linux/acpi.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include <asm/apic.h> -#include <asm/apic_flat_64.h> #include <asm/jailhouse_para.h> +#include <asm/apic.h> + +#include "local.h" static struct apic apic_physflat; static struct apic apic_flat; @@ -83,35 +76,6 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) _flat_send_IPI_mask(mask, vector); } -static void flat_send_IPI_allbutself(int vector) -{ - int cpu = smp_processor_id(); - - if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) { - if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { - unsigned long mask = cpumask_bits(cpu_online_mask)[0]; - - if (cpu < BITS_PER_LONG) - __clear_bit(cpu, &mask); - - _flat_send_IPI_mask(mask, vector); - } - } else if (num_online_cpus() > 1) { - __default_send_IPI_shortcut(APIC_DEST_ALLBUT, - vector, apic->dest_logical); - } -} - -static void flat_send_IPI_all(int vector) -{ - if (vector == NMI_VECTOR) { - flat_send_IPI_mask(cpu_online_mask, vector); - } else { - __default_send_IPI_shortcut(APIC_DEST_ALLINC, - vector, apic->dest_logical); - } -} - static unsigned int flat_get_apic_id(unsigned long x) { return (x >> 24) & 0xFF; @@ -173,9 +137,9 @@ static struct apic apic_flat __ro_after_init = { .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, - .send_IPI_allbutself = flat_send_IPI_allbutself, - .send_IPI_all = flat_send_IPI_all, - .send_IPI_self = apic_send_IPI_self, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, .inquire_remote_apic = default_inquire_remote_apic, @@ -225,16 +189,6 @@ static void physflat_init_apic_ldr(void) */ } -static void physflat_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void physflat_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8 || @@ -276,9 +230,9 @@ static struct apic apic_physflat __ro_after_init = { .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, - .send_IPI_allbutself = physflat_send_IPI_allbutself, - .send_IPI_all = physflat_send_IPI_all, - .send_IPI_self = apic_send_IPI_self, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 5078b5ce63a7..98c9bb75d185 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -9,25 +9,9 @@ * to not uglify the caller's code and allow to call (some) apic routines * like self-ipi, etc... */ - -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/errno.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/apicdef.h> -#include <asm/apic.h> -#include <asm/setup.h> -#include <linux/smp.h> -#include <asm/ipi.h> - -#include <linux/interrupt.h> -#include <asm/acpi.h> -#include <asm/e820/api.h> +#include <asm/apic.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a5464b8b6c46..cdf45b4700f2 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -10,15 +10,15 @@ * Send feedback to <support@numascale.com> * */ - +#include <linux/types.h> #include <linux/init.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/ipi.h> -#include <asm/apic_flat_64.h> + #include <asm/pgtable.h> -#include <asm/pci_x86.h> + +#include "local.h" u8 numachip_system __read_mostly; static const struct apic apic_numachip1; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index afee386ff711..38b5b51d42f6 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -4,18 +4,13 @@ * * Drives the local APIC in "clustered mode". */ -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/kernel.h> -#include <linux/init.h> #include <linux/dmi.h> #include <linux/smp.h> -#include <asm/apicdef.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/ipi.h> + +#include "local.h" static unsigned bigsmp_get_apic_id(unsigned long x) { @@ -38,32 +33,12 @@ static int bigsmp_early_logical_apicid(int cpu) return early_per_cpu(x86_cpu_to_apicid, cpu); } -static inline unsigned long calculate_ldr(int cpu) -{ - unsigned long val, id; - - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - id = per_cpu(x86_bios_cpu_apicid, cpu); - val |= SET_APIC_LOGICAL_ID(id); - - return val; -} - /* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * bigsmp enables physical destination mode + * and doesn't use LDR and DFR */ static void bigsmp_init_apic_ldr(void) { - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); } static void bigsmp_setup_apic_routing(void) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c7bb6c69f21c..913c88617848 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { - if (unlikely(masked)) { + if (unlikely(moveit)) { /* Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets @@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { } #endif @@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data) { struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; - bool masked; + bool moveit; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(irq_data); + moveit = ioapic_prepare_move(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(irq_data, masked); + ioapic_finish_move(irq_data, moveit); } static void ioapic_ir_ack_level(struct irq_data *irq_data) @@ -2438,7 +2441,13 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - return ioapic_initialized ? ioapic_dynirq_base : gsi_top; + if (!ioapic_initialized) + return gsi_top; + /* + * For DT enabled machines ioapic_dynirq_base is irrelevant and not + * updated. So simply return @from if ioapic_dynirq_base == 0. + */ + return ioapic_dynirq_base ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 82f9244fe61f..6ca0f91372fd 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,24 +1,113 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/cpumask.h> -#include <linux/interrupt.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/cache.h> -#include <linux/cpu.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/apic.h> -#include <asm/proto.h> -#include <asm/ipi.h> - -void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +#include <linux/smp.h> + +#include "local.h" + +DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +#ifdef CONFIG_SMP +static int apic_ipi_shorthand_off __ro_after_init; + +static __init int apic_ipi_shorthand(char *str) +{ + get_option(&str, &apic_ipi_shorthand_off); + return 1; +} +__setup("no_ipi_broadcast=", apic_ipi_shorthand); + +static int __init print_ipi_mode(void) +{ + pr_info("IPI shorthand broadcast: %s\n", + apic_ipi_shorthand_off ? "disabled" : "enabled"); + return 0; +} +late_initcall(print_ipi_mode); + +void apic_smt_update(void) +{ + /* + * Do not switch to broadcast mode if: + * - Disabled on the command line + * - Only a single CPU is online + * - Not all present CPUs have been at least booted once + * + * The latter is important as the local APIC might be in some + * random state and a broadcast might cause havoc. That's + * especially true for NMI broadcasting. + */ + if (apic_ipi_shorthand_off || num_online_cpus() == 1 || + !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) { + static_branch_disable(&apic_use_ipi_shorthand); + } else { + static_branch_enable(&apic_use_ipi_shorthand); + } +} + +void apic_send_IPI_allbutself(unsigned int vector) +{ + if (num_online_cpus() < 2) + return; + + if (static_branch_likely(&apic_use_ipi_shorthand)) + apic->send_IPI_allbutself(vector); + else + apic->send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +/* + * Send a 'reschedule' IPI to another CPU. It goes straight through and + * wastes no time serializing anything. Worst case is that we lose a + * reschedule ... + */ +void native_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); + return; + } + apic->send_IPI(cpu, RESCHEDULE_VECTOR); +} + +void native_send_call_func_single_ipi(int cpu) +{ + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); +} + +void native_send_call_func_ipi(const struct cpumask *mask) +{ + if (static_branch_likely(&apic_use_ipi_shorthand)) { + unsigned int cpu = smp_processor_id(); + + if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask)) + goto sendmask; + + if (cpumask_test_cpu(cpu, mask)) + apic->send_IPI_all(CALL_FUNCTION_VECTOR); + else if (num_online_cpus() > 1) + apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + return; + } + +sendmask: + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +} + +#endif /* CONFIG_SMP */ + +static inline int __prepare_ICR2(unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +static inline void __xapic_wait_icr_idle(void) +{ + while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* * Subtle. In the case of the 'never do double writes' workaround @@ -32,12 +121,16 @@ void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int /* * Wait for idle. */ - __xapic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); /* - * No need to touch the target chip field + * No need to touch the target chip field. Also the destination + * mode is ignored when a shorthand is used. */ - cfg = __prepare_ICR(shortcut, vector, dest); + cfg = __prepare_ICR(shortcut, vector, 0); /* * Send the IPI. The write to APIC_ICR fires this off. @@ -133,6 +226,21 @@ void default_send_IPI_single(int cpu, int vector) apic->send_IPI_mask(cpumask_of(cpu), vector); } +void default_send_IPI_allbutself(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +} + +void default_send_IPI_all(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); +} + +void default_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, @@ -192,28 +300,6 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_restore(flags); } -void default_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then we get an APIC send - * error if we try to broadcast, thus avoid sending IPIs in this case. - */ - if (!(num_online_cpus() > 1)) - return; - - __default_local_send_IPI_allbutself(vector); -} - -void default_send_IPI_all(int vector) -{ - __default_local_send_IPI_all(vector); -} - -void default_send_IPI_self(int vector) -{ - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical); -} - /* must come after the send_IPI functions above for inlining */ static int convert_apicid_to_cpu(int apic_id) { diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h new file mode 100644 index 000000000000..04797f05ce94 --- /dev/null +++ b/arch/x86/kernel/apic/local.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Historical copyright notices: + * + * Copyright 2004 James Cleverdon, IBM. + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. + */ + +#include <linux/jump_label.h> + +#include <asm/apic.h> + +/* APIC flat 64 */ +void flat_init_apic_ldr(void); + +/* X2APIC */ +int x2apic_apic_id_valid(u32 apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); +void __x2apic_send_IPI_shorthand(int vector, u32 which); + +/* IPI */ + +DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, + unsigned int dest) +{ + unsigned int icr = shortcut | dest; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector); + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); + +void default_send_IPI_single(int cpu, int vector); +void default_send_IPI_single_phys(int cpu, int vector); +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); +void default_send_IPI_allbutself(int vector); +void default_send_IPI_all(int vector); +void default_send_IPI_self(int vector); + +#ifdef CONFIG_X86_32 +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +#endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7f7533462474..159bd0cb8548 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -23,10 +23,8 @@ static struct irq_domain *msi_default_domain; -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) @@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg); +} + +static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) +{ + struct msi_msg msg[2] = { [1] = { }, }; + + __irq_msi_compose_msg(cfg, msg); + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); +} + +static int +msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) +{ + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); + struct irq_data *parent = irqd->parent_data; + unsigned int cpu; + int ret; + + /* Save the current configuration */ + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + old_cfg = *cfg; + + /* Allocate a new target vector */ + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * For non-maskable and non-remapped MSI interrupts the migration + * to a different destination CPU and a different vector has to be + * done careful to handle the possible stray interrupt which can be + * caused by the non-atomic update of the address/data pair. + * + * Direct update is possible when: + * - The MSI is maskable (remapped MSI does not use this code path)). + * The quirk bit is not set in this case. + * - The new vector is the same as the old vector + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The new destination CPU is the same as the old destination CPU + */ + if (!irqd_msi_nomask_quirk(irqd) || + cfg->vector == old_cfg.vector || + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + cfg->dest_apicid == old_cfg.dest_apicid) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Paranoia: Validate that the interrupt target is the local + * CPU. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Redirect the interrupt to the new vector on the current CPU + * first. This might cause a spurious interrupt on this vector if + * the device raises an interrupt right between this update and the + * update to the final destination CPU. + * + * If the vector is in use then the installed device handler will + * denote it as spurious which is no harm as this is a rare event + * and interrupt handlers have to cope with spurious interrupts + * anyway. If the vector is unused, then it is marked so it won't + * trigger the 'No irq handler for vector' warning in do_IRQ(). + * + * This requires to hold vector lock to prevent concurrent updates to + * the affected vector. + */ + lock_vector_lock(); + + /* + * Mark the new target vector on the local CPU if it is currently + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in + * the CPU hotplug path for a similar purpose. This cannot be + * undone here as the current CPU has interrupts disabled and + * cannot handle the interrupt before the whole set_affinity() + * section is done. In the CPU unplug case, the current CPU is + * about to vanish and will not handle any interrupts anymore. The + * vector is cleaned up when the CPU comes online again. + */ + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); + + /* Redirect it to the new vector on the local CPU temporarily */ + old_cfg.vector = cfg->vector; + irq_msi_update_msg(irqd, &old_cfg); + + /* Now transition it to the target CPU */ + irq_msi_update_msg(irqd, cfg); + + /* + * All interrupts after this point are now targeted at the new + * vector/CPU. + * + * Drop vector lock before testing whether the temporary assignment + * to the local CPU was hit by an interrupt raised in the device, + * because the retrigger function acquires vector lock again. + */ + unlock_vector_lock(); + + /* + * Check whether the transition raced with a device interrupt and + * is pending in the local APICs IRR. It is safe to do this outside + * of vector lock as the irq_desc::lock of this interrupt is still + * held and interrupts are disabled: The check is not accessing the + * underlying vector store. It's just checking the local APIC's + * IRR. + */ + if (lapic_vector_set_in_irr(cfg->vector)) + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); + + return ret; +} + /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent) } if (!msi_default_domain) pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + else + msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } #ifdef CONFIG_IRQ_REMAP diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ee2d91e382f1..67b33d67002f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -6,51 +6,14 @@ * * Generic x86 APIC driver probe layer. */ -#include <linux/threads.h> -#include <linux/cpumask.h> #include <linux/export.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> #include <linux/errno.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/apicdef.h> -#include <asm/apic.h> -#include <asm/setup.h> - #include <linux/smp.h> -#include <asm/ipi.h> -#include <linux/interrupt.h> +#include <asm/apic.h> #include <asm/acpi.h> -#include <asm/e820/api.h> -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -int no_broadcast = DEFAULT_SEND_IPI; - -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - pr_info("Using %s mode\n", - no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); - return 1; -} -__setup("no_ipi_broadcast=", no_ipi_broadcast); - -static int __init print_ipi_mode(void) -{ - pr_info("Using IPI %s mode\n", - no_broadcast ? "No-Shortcut" : "Shortcut"); - return 0; -} -late_initcall(print_ipi_mode); +#include "local.h" static int default_x86_32_early_logical_apicid(int cpu) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index e6560a02eb46..29f0e0984557 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -8,19 +8,9 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/hardirq.h> -#include <linux/dmar.h> - -#include <asm/smp.h> #include <asm/apic.h> -#include <asm/ipi.h> -#include <asm/setup.h> + +#include "local.h" /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. @@ -46,13 +36,6 @@ void __init default_setup_apic_routing(void) x86_platform.apic_post_init(); } -/* Same for both flat and physical. */ - -void apic_send_IPI_self(int vector) -{ - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); -} - int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { struct apic **drv; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fdacb864c3dd..2c5676b0a6e7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -398,6 +398,17 @@ static int activate_reserved(struct irq_data *irqd) if (!irqd_can_reserve(irqd)) apicd->can_reserve = false; } + + /* + * Check to ensure that the effective affinity mask is a subset + * the user supplied affinity mask, and warn the user if it is not + */ + if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd), + irq_data_get_affinity_mask(irqd))) { + pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n", + irqd->irq); + } + return ret; } diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h deleted file mode 100644 index a49b3604027f..000000000000 --- a/arch/x86/kernel/apic/x2apic.h +++ /dev/null @@ -1,9 +0,0 @@ -/* Common bits for X2APIC cluster/physical modes. */ - -int x2apic_apic_id_valid(u32 apicid); -int x2apic_apic_id_registered(void); -void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); -unsigned int x2apic_get_apic_id(unsigned long id); -u32 x2apic_set_apic_id(unsigned int id); -int x2apic_phys_pkg_id(int initial_apicid, int index_msb); -void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 609e499387a1..b0889c48a2ac 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -1,15 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/threads.h> + +#include <linux/cpuhotplug.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/dmar.h> -#include <linux/irq.h> -#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/apic.h> -#include <asm/smp.h> -#include "x2apic.h" +#include "local.h" struct cluster_mask { unsigned int clusterid; @@ -84,12 +82,12 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) static void x2apic_send_IPI_allbutself(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } static u32 x2apic_calc_apicid(unsigned int cpu) @@ -158,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) { struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + if (cmsk) + cpumask_clear_cpu(dead_cpu, &cmsk->mask); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b5cf9e7b3830..bc9693841353 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -1,14 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/threads.h> + #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/dmar.h> +#include <linux/acpi.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include "x2apic.h" +#include "local.h" int x2apic_phys; @@ -80,12 +75,12 @@ static void static void x2apic_send_IPI_allbutself(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } static void init_x2apic_ldr(void) @@ -117,6 +112,14 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) native_x2apic_icr_write(cfg, apicid); } +void __x2apic_send_IPI_shorthand(int vector, u32 which) +{ + unsigned long cfg = __prepare_ICR(which, vector, 0); + + x2apic_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); +} + unsigned int x2apic_get_apic_id(unsigned long id) { return id; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1e225528f0d7..ad53b2abc859 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -7,50 +7,37 @@ * * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ +#include <linux/crash_dump.h> +#include <linux/cpuhotplug.h> #include <linux/cpumask.h> -#include <linux/hardirq.h> #include <linux/proc_fs.h> -#include <linux/threads.h> -#include <linux/kernel.h> +#include <linux/memory.h> #include <linux/export.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/sched.h> -#include <linux/timer.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/init.h> -#include <linux/io.h> #include <linux/pci.h> -#include <linux/kdebug.h> -#include <linux/delay.h> -#include <linux/crash_dump.h> -#include <linux/reboot.h> -#include <linux/memory.h> -#include <linux/numa.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <asm/e820/api.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> -#include <asm/current.h> -#include <asm/pgtable.h> #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> -#include <asm/e820/api.h> -#include <asm/ipi.h> -#include <asm/smp.h> -#include <asm/x86_init.h> -#include <asm/nmi.h> -DEFINE_PER_CPU(int, x2apic_extra_bits); +static DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static bool uv_hubless_system; +static int uv_hubbed_system; +static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; +/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + /* Information derived from CPUID: */ static struct { unsigned int apicid_shift; @@ -268,17 +255,35 @@ static void __init uv_set_apicid_hibit(void) } } -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static void __init uv_stringify(int len, char *to, char *from) +{ + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); +} + +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) { int pnodeid; int uv_apic; + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) == 0) { - uv_hubless_system = true; - pr_info("UV: OEM IDs %s/%s, HUBLESS\n", - oem_id, oem_table_id); - } + if (strncmp(oem_id, "NSGI", 4) != 0) + return 0; + + /* UV4 Hubless, CH, (0x11:UV4+Any) */ + if (strncmp(oem_id, "NSGI4", 5) == 0) + uv_hubless_system = 0x11; + + /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + else + uv_hubless_system = 0x9; + + pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", + oem_id, oem_table_id, uv_hubless_system); + return 0; } @@ -306,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (uv_hub_info->hub_revision == 0) goto badbios; + switch (uv_hub_info->hub_revision) { + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + break; + + case UV1_HUB_REVISION_BASE: + uv_hubbed_system = 0x3; + break; + } + pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); @@ -356,9 +379,15 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -int is_uv_hubless(void) +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +int is_uv_hubless(int uvtype) { - return uv_hubless_system; + return (uv_hubless_system & uvtype); } EXPORT_SYMBOL_GPL(is_uv_hubless); @@ -1275,7 +1304,8 @@ static int __init decode_uv_systab(void) struct uv_systab *st; int i; - if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + /* If system is uv3 or lower, there is no extended UVsystab */ + if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) return 0; /* No extended UVsystab required */ st = uv_systab; @@ -1454,6 +1484,72 @@ static void __init build_socket_tables(void) } } +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* Setup user proc fs files */ +static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data) +{ + seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + return 0; +} + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create_single("oemid", 0, pde, proc_oemid_show); + if (hubless) + proc_create_single("hubless", 0, pde, proc_hubless_show); + else + proc_create_single("hubbed", 0, pde, proc_hubbed_show); +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; @@ -1579,32 +1675,27 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); uv_scir_register_cpu_notifier(); - proc_mkdir("sgi_uv", NULL); + uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); - /* - * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel: - */ - if (is_kdump_kernel()) - reboot_type = BOOT_ACPI; + check_efi_reboot(); } /* - * There is a small amount of UV specific code needed to initialize a - * UV system that does not have a "UV HUB" (referred to as "hubless"). + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). */ void __init uv_system_init(void) { - if (likely(!is_uv_system() && !is_uv_hubless())) + if (likely(!is_uv_system() && !is_uv_hubless(1))) return; if (is_uv_system()) uv_system_init_hub(); else - uv_nmi_setup_hubless(); + uv_system_init_hubless(); } apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d3d075226c0a..24d2fde30d00 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -6,13 +6,28 @@ #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, qual) [nr] = 1, +#define __SYSCALL_X32(nr, sym, qual) static char syscalls_64[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, +static char syscalls_x32[] = { +#include <asm/syscalls_64.h> +}; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 +#endif + #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL_I386 #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #include <asm/kvm_para.h> @@ -62,7 +77,6 @@ int main(void) ENTRY(cr2); ENTRY(cr3); ENTRY(cr4); - ENTRY(cr8); ENTRY(gdt_desc); BLANK(); #undef ENTRY @@ -80,6 +94,11 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); +#ifdef CONFIG_X86_X32_ABI + DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); + DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); +#endif + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..7dc4ad68eb41 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -29,8 +29,9 @@ obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o @@ -53,11 +54,12 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ cpufeature = $(src)/../../include/asm/cpufeatures.h +vmxfeature = $(src)/../../include/asm/vmxfeatures.h -$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif targets += capflags.c diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8d4e50428b68..ac83a0fef628 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/random.h> +#include <linux/topology.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -318,13 +319,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) c->cpu_core_id %= cus_per_node; } - -static void amd_get_topology_early(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; -} - /* * Fixup core topology information for * (1) AMD multi-node processors @@ -614,9 +608,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) return; clear_all: - clear_cpu_cap(c, X86_FEATURE_SME); + setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: - clear_cpu_cap(c, X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV); } } @@ -716,7 +710,8 @@ static void early_init_amd(struct cpuinfo_x86 *c) } } - amd_get_topology_early(c); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -804,6 +799,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c) msr_set_bit(MSR_AMD64_DE_CFG, 31); } +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -818,12 +871,23 @@ static void init_amd_bd(struct cpuinfo_x86 *c) wrmsrl_safe(MSR_F15H_IC_CFG, value); } } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); } static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + /* * Fix erratum 1076: CPB feature bit not being set in CPUID. * Always set it, except when running under a hypervisor. @@ -860,6 +924,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; case 0x17: init_amd_zn(c); break; } @@ -879,12 +944,8 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -892,19 +953,8 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. - */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c6fa3ef10b4e..ed54b3b21c39 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -105,6 +107,13 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); + + /* + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. + */ + mds_print_mitigation(); arch_smt_update(); @@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void) (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -269,6 +284,120 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -786,13 +915,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -void arch_smt_update(void) +void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +945,17 @@ void arch_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1286,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1184,15 +1324,15 @@ static void override_cache_bits(struct cpuinfo_x86 *c) case INTEL_FAM6_WESTMERE: case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL_CORE: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_GT3E: - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_HASWELL: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_G: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -1304,11 +1444,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1481,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1566,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1608,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 14433ff5b828..426792565d86 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -18,13 +18,6 @@ #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 -#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 -#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 -#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 -#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 -#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 - static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -71,8 +64,6 @@ static void init_c3(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_REP_GOOD); } - - cpu_detect_cache_sizes(c); } enum { @@ -119,31 +110,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c) } } -static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c) -{ - u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; - - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); - msr_ctl = vmx_msr_high | vmx_msr_low; - - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) - set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) - set_cpu_cap(c, X86_FEATURE_VNMI); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - vmx_msr_low, vmx_msr_high); - msr_ctl2 = vmx_msr_high | vmx_msr_low; - if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && - (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) - set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) - set_cpu_cap(c, X86_FEATURE_EPT); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) - set_cpu_cap(c, X86_FEATURE_VPID); - } -} - static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -250,8 +216,7 @@ static void init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif - if (cpu_has(c, X86_FEATURE_VMX)) - centaur_detect_vmx_virtcap(c); + init_ia32_feat_ctl(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f125bf7ecb6f..52c9bfbbdb2a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -14,6 +14,7 @@ #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> +#include <linux/sched/smt.h> #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> @@ -24,6 +25,7 @@ #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> +#include <asm/doublefault.h> #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -48,15 +50,12 @@ #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> - -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> -#endif #include "cpu.h" @@ -165,22 +164,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -static int __init x86_mpx_setup(char *s) -{ - /* require an exact match without trailing characters */ - if (strlen(s)) - return 0; - - /* do not emit a message if the feature is not present */ - if (!boot_cpu_has(X86_FEATURE_MPX)) - return 1; - - setup_clear_cpu_cap(X86_FEATURE_MPX); - pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n"); - return 1; -} -__setup("nompx", x86_mpx_setup); - #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) { @@ -307,8 +290,6 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) static __init int setup_disable_smep(char *arg) { setup_clear_cpu_cap(X86_FEATURE_SMEP); - /* Check for things that depend on SMEP being enabled: */ - check_mpx_erratum(&boot_cpu_data); return 1; } __setup("nosmep", setup_disable_smep); @@ -565,8 +546,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { @@ -1016,13 +998,15 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) +#define NO_SPECTRE_V2 BIT(8) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,26 +1027,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1072,15 +1057,21 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* Zhaoxin Family 7 */ + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), {} }; @@ -1091,18 +1082,31 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + if (!cpu_matches(NO_SPECTRE_V2)) + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) @@ -1120,6 +1124,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1420,6 +1439,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof(c->x86_capability)); +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + memset(&c->vmx_capability, 0, sizeof(c->vmx_capability)); +#endif generic_identify(c); @@ -1553,6 +1575,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1748,7 +1772,7 @@ static void wait_for_master_cpu(int cpu) } #ifdef CONFIG_X86_64 -static void setup_getcpu(int cpu) +static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; @@ -1768,7 +1792,50 @@ static void setup_getcpu(int cpu) write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } + +static inline void ucode_cpu_init(int cpu) +{ + if (cpu) + load_ucode_ap(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) +{ + /* Set up the per-CPU TSS IST stacks */ + tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); +} + +#else /* CONFIG_X86_64 */ + +static inline void setup_getcpu(int cpu) { } + +static inline void ucode_cpu_init(int cpu) +{ + show_ucode_info_early(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) { } + +#endif /* !CONFIG_X86_64 */ + +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; #endif +} /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -1776,21 +1843,15 @@ static void setup_getcpu(int cpu) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -#ifdef CONFIG_X86_64 - void cpu_init(void) { + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - struct task_struct *me; - struct tss_struct *t; - int i; wait_for_master_cpu(cpu); - if (cpu) - load_ucode_ap(); - - t = &per_cpu(cpu_tss_rw, cpu); + ucode_cpu_init(cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1799,63 +1860,47 @@ void cpu_init(void) #endif setup_getcpu(cpu); - me = current; - pr_debug("Initializing CPU#%d\n", cpu); - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ - switch_to_new_gdt(cpu); - loadsegment(fs, 0); - load_current_idt(); - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); + if (IS_ENABLED(CONFIG_X86_64)) { + loadsegment(fs, 0); + memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); - x86_configure_nx(); - x2apic_setup(); + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); - /* - * set up and load the per-CPU TSS - */ - if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + x2apic_setup(); } - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - mmgrab(&init_mm); - me->active_mm = &init_mm; - BUG_ON(me->mm); + cur->active_mm = &init_mm; + BUG_ON(cur->mm); initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, me); + enter_lazy_tlb(&init_mm, cur); - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ + /* Initialize the TSS. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + /* + * sp0 points to the entry trampoline stack regardless of what task + * is running. + */ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); @@ -1863,6 +1908,8 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); + doublefault_init_cpu_tss(); + fpu__init_cpu(); if (is_uv_system()) @@ -1871,63 +1918,6 @@ void cpu_init(void) load_fixmap_gdt(cpu); } -#else - -void cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); - - wait_for_master_cpu(cpu); - - show_ucode_info_early(); - - pr_info("Initializing CPU#%d\n", cpu); - - if (cpu_feature_enabled(X86_FEATURE_VME) || - boot_cpu_has(X86_FEATURE_TSC) || - boot_cpu_has(X86_FEATURE_DE)) - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - load_current_idt(); - switch_to_new_gdt(cpu); - - /* - * Set up and load the per-CPU TSS and LDT - */ - mmgrab(&init_mm); - curr->active_mm = &init_mm; - BUG_ON(curr->mm); - initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, curr); - - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); - load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); - - load_mm_ldt(&init_mm); - - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - -#ifdef CONFIG_DOUBLEFAULT - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif - - clear_all_debug_regs(); - dbg_restore_debug_regs(); - - fpu__init_cpu(); - - load_fixmap_gdt(cpu); -} -#endif - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU @@ -1957,3 +1947,14 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } + +/* + * Invoked from core CPU hotplug code after hotplug operations + */ +void arch_smt_update(void) +{ + /* Handle the speculative execution misfeatures */ + cpu_bugs_smt_update(); + /* Check whether IPI broadcasting can be enabled */ + apic_smt_update(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..37fdefd14f28 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,10 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + +#ifdef CONFIG_IA32_FEAT_CTL +void init_ia32_feat_ctl(struct cpuinfo_x86 *c); +#endif + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b5353244749b..3cbe24ca80ab 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -20,54 +20,55 @@ struct cpuid_dep { * but it's difficult to tell that to the init reference checker. */ static const struct cpuid_dep cpuid_deps[] = { - { X86_FEATURE_FXSR, X86_FEATURE_FPU }, - { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, - { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, - { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, - { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, - { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, - { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, - { X86_FEATURE_MMX, X86_FEATURE_FXSR }, - { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, - { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, - { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM2, X86_FEATURE_XMM }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, - { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, - { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, - { X86_FEATURE_AES, X86_FEATURE_XMM2 }, - { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, - { X86_FEATURE_FMA, X86_FEATURE_AVX }, - { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c new file mode 100644 index 000000000000..0268185bef94 --- /dev/null +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/tboot.h> + +#include <asm/cpufeature.h> +#include <asm/msr-index.h> +#include <asm/processor.h> +#include <asm/vmx.h> + +#undef pr_fmt +#define pr_fmt(fmt) "x86/cpu: " fmt + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +enum vmx_feature_leafs { + MISC_FEATURES = 0, + PRIMARY_CTLS, + SECONDARY_CTLS, + NR_VMX_FEATURE_WORDS, +}; + +#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f) + +static void init_vmx_capabilities(struct cpuinfo_x86 *c) +{ + u32 supported, funcs, ept, vpid, ign; + + BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); + + /* + * The high bits contain the allowed-1 settings, i.e. features that can + * be turned on. The low bits contain the allowed-0 settings, i.e. + * features that can be turned off. Ignore the allowed-0 settings, + * if a feature can be turned on then it's supported. + * + * Use raw rdmsr() for primary processor controls and pin controls MSRs + * as they exist on any CPU that supports VMX, i.e. we want the WARN if + * the RDMSR faults. + */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported); + c->vmx_capability[PRIMARY_CTLS] = supported; + + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); + c->vmx_capability[SECONDARY_CTLS] = supported; + + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); + rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); + + /* + * Except for EPT+VPID, which enumerates support for both in a single + * MSR, low for EPT, high for VPID. + */ + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid); + + /* Pin, EPT, VPID and VM-Func are merged into a single word. */ + WARN_ON_ONCE(supported >> 16); + WARN_ON_ONCE(funcs >> 4); + c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) | + ((vpid & 0x1) << 16) | + ((funcs & 0xf) << 28); + + /* EPT bits are full on scattered and must be manually handled. */ + if (ept & VMX_EPT_EXECUTE_ONLY_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY); + if (ept & VMX_EPT_AD_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); + if (ept & VMX_EPT_1GB_PAGE_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + + /* Synthetic APIC features that are aggregates of multiple features. */ + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY); + + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) && + (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV); + + /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */ + if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS)) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT)) + set_cpu_cap(c, X86_FEATURE_EPT); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD)) + set_cpu_cap(c, X86_FEATURE_EPT_AD); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID)) + set_cpu_cap(c, X86_FEATURE_VPID); +} +#endif /* CONFIG_X86_VMX_FEATURE_NAMES */ + +void init_ia32_feat_ctl(struct cpuinfo_x86 *c) +{ + bool tboot = tboot_enabled(); + u64 msr; + + if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + clear_cpu_cap(c, X86_FEATURE_VMX); + return; + } + + if (msr & FEAT_CTL_LOCKED) + goto update_caps; + + /* + * Ignore whatever value BIOS left in the MSR to avoid enabling random + * features or faulting on the WRMSR. + */ + msr = FEAT_CTL_LOCKED; + + /* + * Enable VMX if and only if the kernel may do VMXON at some point, + * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector + * for the kernel, e.g. using VMX to hide malicious code. + */ + if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + + if (tboot) + msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; + } + + wrmsrl(MSR_IA32_FEAT_CTL, msr); + +update_caps: + set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); + + if (!cpu_has(c, X86_FEATURE_VMX)) + return; + + if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || + (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { + if (IS_ENABLED(CONFIG_KVM_INTEL)) + pr_err_once("VMX (%s TXT) disabled by BIOS\n", + tboot ? "inside" : "outside"); + clear_cpu_cap(c, X86_FEATURE_VMX); + } else { +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + init_vmx_capabilities(c); +#endif + } +} diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 415621ddb8a2..4e28c1fc8749 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -330,12 +330,8 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -343,19 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. - */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8d6d92ebeb54..be82cd5841c3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -32,41 +32,6 @@ #endif /* - * Just in case our CPU detection goes bad, or you have a weird system, - * allow a way to override the automatic disabling of MPX. - */ -static int forcempx; - -static int __init forcempx_setup(char *__unused) -{ - forcempx = 1; - - return 1; -} -__setup("intel-skd-046-workaround=disable", forcempx_setup); - -void check_mpx_erratum(struct cpuinfo_x86 *c) -{ - if (forcempx) - return; - /* - * Turn off the MPX feature on CPUs where SMEP is not - * available or disabled. - * - * Works around Intel Erratum SKD046: "Branch Instructions - * May Initialize MPX Bound Registers Incorrectly". - * - * This might falsely disable MPX on systems without - * SMEP, like Atom processors without SMEP. But there - * is no such hardware known at the moment. - */ - if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) { - setup_clear_cpu_cap(X86_FEATURE_MPX); - pr_warn("x86/mpx: Disabling MPX since SMEP not present\n"); - } -} - -/* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists * CPU models in which having conflicting memory types still leads to @@ -142,21 +107,21 @@ struct sku_microcode { u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL, 0x03, 0x23 }, { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, @@ -265,9 +230,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ if (c->x86 == 6) { switch (c->x86_model) { - case 0x27: /* Penwell */ - case 0x35: /* Cloverview */ - case 0x4a: /* Merrifield */ + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: + case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT_NP: set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: @@ -329,7 +295,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); } - check_mpx_erratum(c); check_memory_type_self_snoop_errata(c); /* @@ -493,52 +458,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void detect_vmx_virtcap(struct cpuinfo_x86 *c) -{ - /* Intel VMX MSR indicated features */ -#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 -#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 -#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 -#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 -#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 -#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 -#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000 - - u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; - u32 msr_vpid_cap, msr_ept_cap; - - clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); - clear_cpu_cap(c, X86_FEATURE_VNMI); - clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - clear_cpu_cap(c, X86_FEATURE_EPT); - clear_cpu_cap(c, X86_FEATURE_VPID); - clear_cpu_cap(c, X86_FEATURE_EPT_AD); - - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); - msr_ctl = vmx_msr_high | vmx_msr_low; - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) - set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) - set_cpu_cap(c, X86_FEATURE_VNMI); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - vmx_msr_low, vmx_msr_high); - msr_ctl2 = vmx_msr_high | vmx_msr_low; - if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && - (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) - set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) { - set_cpu_cap(c, X86_FEATURE_EPT); - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, - msr_ept_cap, msr_vpid_cap); - if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD) - set_cpu_cap(c, X86_FEATURE_EPT_AD); - } - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) - set_cpu_cap(c, X86_FEATURE_VPID); - } -} - #define MSR_IA32_TME_ACTIVATE 0x982 /* Helpers to access TME_ACTIVATE MSR */ @@ -754,13 +673,17 @@ static void init_intel(struct cpuinfo_x86 *c) /* Work around errata */ srat_detect_node(c); - if (cpu_has(c, X86_FEATURE_VMX)) - detect_vmx_virtcap(c); + init_ia32_feat_ctl(c); if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 @@ -813,7 +736,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, @@ -841,7 +764,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -853,8 +776,8 @@ static void intel_tlb_lookup(const unsigned char desc) return; /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && \ - intel_tlb_table[k].descriptor != 0; k++) + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) ; if (intel_tlb_table[k].tlb_type == 0) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ea7fdc82f3c..b3a50d962851 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -78,6 +78,7 @@ struct smca_bank_name { static struct smca_bank_name smca_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_LS_V2] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, [SMCA_DE] = { "decode_unit", "Decode Unit" }, @@ -138,6 +139,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* ZN Core (HWID=0xB0) MCA types */ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, @@ -266,10 +268,10 @@ static void smca_configure(unsigned int bank, unsigned int cpu) smca_set_misc_banks_map(bank, cpu); /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid) + if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -583,7 +585,7 @@ bool amd_filter_mce(struct mce *m) * - Prevent possible spurious interrupts from the IF bank on Family 0x17 * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { int i, num_msrs; u64 hwcr; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 743370ee4983..2c4f949611e4 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -53,8 +53,6 @@ #include "internal.h" -static DEFINE_MUTEX(mce_log_mutex); - /* sysfs synchronization */ static DEFINE_MUTEX(mce_sysfs_mutex); @@ -156,19 +154,10 @@ void mce_log(struct mce *m) if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); } - -void mce_inject_log(struct mce *m) -{ - mutex_lock(&mce_log_mutex); - mce_log(m); - mutex_unlock(&mce_log_mutex); -} -EXPORT_SYMBOL_GPL(mce_inject_log); - -static struct notifier_block mce_srao_nb; +EXPORT_SYMBOL_GPL(mce_log); /* - * We run the default notifier if we have only the SRAO, the first and the + * We run the default notifier if we have only the UC, the first and the * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS * notifiers registered on the chain. */ @@ -488,8 +477,9 @@ int mce_usable_address(struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) return 0; - /* Checks after this one are Intel-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 1; if (!(m->status & MCI_STATUS_MISCV)) @@ -507,10 +497,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address); bool mce_is_memory_error(struct mce *m) { - if (m->cpuvendor == X86_VENDOR_AMD || - m->cpuvendor == X86_VENDOR_HYGON) { + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -527,9 +520,10 @@ bool mce_is_memory_error(struct mce *m) return (m->status & 0xef80) == BIT(7) || (m->status & 0xef00) == BIT(8) || (m->status & 0xeffc) == 0xc; - } - return false; + default: + return false; + } } EXPORT_SYMBOL_GPL(mce_is_memory_error); @@ -589,26 +583,29 @@ static struct notifier_block first_nb = { .priority = MCE_PRIO_FIRST, }; -static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, - void *data) +static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) { struct mce *mce = (struct mce *)data; unsigned long pfn; - if (!mce) + if (!mce || !mce_usable_address(mce)) return NOTIFY_DONE; - if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { - pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) - set_mce_nospec(pfn); - } + if (mce->severity != MCE_AO_SEVERITY && + mce->severity != MCE_DEFERRED_SEVERITY) + return NOTIFY_DONE; + + pfn = mce->addr >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); return NOTIFY_OK; } -static struct notifier_block mce_srao_nb = { - .notifier_call = srao_decode_notifier, - .priority = MCE_PRIO_SRAO, + +static struct notifier_block mce_uc_nb = { + .notifier_call = uc_decode_notifier, + .priority = MCE_PRIO_UC, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -758,26 +755,22 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) log_it: error_seen = true; - mce_read_aux(&m, i); + if (flags & MCP_DONTLOG) + goto clear_it; + mce_read_aux(&m, i); m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) - mce_log(&m); - else if (mce_usable_address(&m)) { - /* - * Although we skipped logging this, we still want - * to take action. Add to the pool so the registered - * notifiers will see it. - */ - if (!mce_gen_pool_add(&m)) - mce_schedule_work(); - } + if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + goto clear_it; + + mce_log(&m); + +clear_it: /* * Clear state for this bank. */ @@ -802,7 +795,7 @@ EXPORT_SYMBOL_GPL(machine_check_poll); static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { - char *tmp; + char *tmp = *msg; int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { @@ -814,8 +807,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); + m->bank = i; if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { - m->bank = i; mce_read_aux(m, i); *msg = tmp; return 1; @@ -1127,6 +1120,12 @@ static bool __mc_check_crashing_cpu(int cpu) u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + if (mcgstatus & MCG_STATUS_RIPV) { mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); return true; @@ -1221,8 +1220,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; int cpu = smp_processor_id(); - char *msg = "Unknown"; struct mce m, *final; + char *msg = NULL; int worst = 0; /* @@ -1277,9 +1276,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Check if this MCE is signaled to only this logical processor, - * on Intel only. + * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL) + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* @@ -1353,7 +1353,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) ist_end_non_atomic(); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) - mce_panic("Failed kernel mode recovery", &m, NULL); + mce_panic("Failed kernel mode recovery", &m, msg); } out_ist: @@ -1697,6 +1697,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) @@ -1760,6 +1772,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c) } } +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1781,6 +1822,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_centaur_feature_init(c); break; + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + default: break; } @@ -1792,6 +1837,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_clear(c); break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + default: break; } @@ -1979,7 +2029,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&first_nb); - mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2014,15 +2064,16 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs - * are socket-wide. - * Disabling them for just a single offlined CPU is bad, since it will - * inhibit reporting for all shared resources on the socket like the - * last level cache (LLC), the integrated memory controller (iMC), etc. + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 1f30117b24ba..3413b41b8d55 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -494,7 +494,7 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_inject_log(&i_mce); + mce_log(&i_mce); return; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index e43eb6732630..5627b1091b85 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -85,8 +85,10 @@ static int cmci_supported(int *banks) * initialization is vendor keyed and this * makes sure none of the backdoors are entered otherwise. */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -113,15 +115,16 @@ static bool lmce_supported(void) /* * BIOS should indicate support for LMCE by setting bit 20 in - * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will - * generate a #GP fault. + * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP + * fault. The MSR must also be locked for LMCE_ENABLED to take effect. + * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally + * locks the MSR in the event that it wasn't already locked by BIOS. */ - rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); - if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == - (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) - return true; + rdmsrl(MSR_IA32_FEAT_CTL, tmp); + if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) + return false; - return false; + return tmp & FEAT_CTL_LMCE_ENABLED; } bool mce_intel_cmci_poll(void) @@ -423,7 +426,7 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -static void intel_init_cmci(void) +void intel_init_cmci(void) { int banks; @@ -442,7 +445,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -static void intel_init_lmce(void) +void intel_init_lmce(void) { u64 val; @@ -455,7 +458,7 @@ static void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } -static void intel_clear_lmce(void) +void intel_clear_lmce(void) { u64 val; @@ -479,9 +482,10 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) switch (c->x86_model) { case INTEL_FAM6_IVYBRIDGE_X: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43031db429d2..b785c0d0b590 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } #endif void mce_timer_kick(unsigned long interval); @@ -78,8 +84,6 @@ static inline int apei_clear_mce(u64 record_id) } #endif -void mce_inject_log(struct mce *m); - /* * We consider records to be equivalent if bank+status+addr+misc all match. * This is only used when the system is going down because of a fatal error diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 210f1f5db5f7..87bcdc6dc2f0 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -107,11 +107,11 @@ static struct severity { */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), /* ignore OVER for UCNA */ diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index 6e2becf547c5..58b4ee3cda77 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -40,15 +40,58 @@ #define THERMAL_THROTTLING_EVENT 0 #define POWER_LIMIT_EVENT 1 -/* - * Current thermal event state: +/** + * struct _thermal_state - Represent the current thermal event state + * @next_check: Stores the next timestamp, when it is allowed + * to log the next warning message. + * @last_interrupt_time: Stores the timestamp for the last threshold + * high event. + * @therm_work: Delayed workqueue structure + * @count: Stores the current running count for thermal + * or power threshold interrupts. + * @last_count: Stores the previous running count for thermal + * or power threshold interrupts. + * @max_time_ms: This shows the maximum amount of time CPU was + * in throttled state for a single thermal + * threshold high to low state. + * @total_time_ms: This is a cumulative time during which CPU was + * in the throttled state. + * @rate_control_active: Set when a throttling message is logged. + * This is used for the purpose of rate-control. + * @new_event: Stores the last high/low status of the + * THERM_STATUS_PROCHOT or + * THERM_STATUS_POWER_LIMIT. + * @level: Stores whether this _thermal_state instance is + * for a CORE level or for PACKAGE level. + * @sample_index: Index for storing the next sample in the buffer + * temp_samples[]. + * @sample_count: Total number of samples collected in the buffer + * temp_samples[]. + * @average: The last moving average of temperature samples + * @baseline_temp: Temperature at which thermal threshold high + * interrupt was generated. + * @temp_samples: Storage for temperature samples to calculate + * moving average. + * + * This structure is used to represent data related to thermal state for a CPU. + * There is a separate storage for core and package level for each CPU. */ struct _thermal_state { - bool new_event; - int event; u64 next_check; + u64 last_interrupt_time; + struct delayed_work therm_work; unsigned long count; unsigned long last_count; + unsigned long max_time_ms; + unsigned long total_time_ms; + bool rate_control_active; + bool new_event; + u8 level; + u8 sample_index; + u8 sample_count; + u8 average; + u8 baseline_temp; + u8 temp_samples[3]; }; struct thermal_state { @@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count); define_therm_throt_device_show_func(package_power_limit, count); define_therm_throt_device_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(core_throttle, max_time_ms); +define_therm_throt_device_one_ro(core_throttle_max_time_ms); + +define_therm_throt_device_show_func(package_throttle, max_time_ms); +define_therm_throt_device_one_ro(package_throttle_max_time_ms); + +define_therm_throt_device_show_func(core_throttle, total_time_ms); +define_therm_throt_device_one_ro(core_throttle_total_time_ms); + +define_therm_throt_device_show_func(package_throttle, total_time_ms); +define_therm_throt_device_one_ro(package_throttle_total_time_ms); + static struct attribute *thermal_throttle_attrs[] = { &dev_attr_core_throttle_count.attr, + &dev_attr_core_throttle_max_time_ms.attr, + &dev_attr_core_throttle_total_time_ms.attr, NULL }; @@ -135,6 +192,112 @@ static const struct attribute_group thermal_attr_group = { #define CORE_LEVEL 0 #define PACKAGE_LEVEL 1 +#define THERM_THROT_POLL_INTERVAL HZ +#define THERM_STATUS_PROCHOT_LOG BIT(1) + +#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) + +static void clear_therm_status_log(int level) +{ + int msr; + u64 mask, msr_val; + + if (level == CORE_LEVEL) { + msr = MSR_IA32_THERM_STATUS; + mask = THERM_STATUS_CLEAR_CORE_MASK; + } else { + msr = MSR_IA32_PACKAGE_THERM_STATUS; + mask = THERM_STATUS_CLEAR_PKG_MASK; + } + + rdmsrl(msr, msr_val); + msr_val &= mask; + wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); +} + +static void get_therm_status(int level, bool *proc_hot, u8 *temp) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + if (msr_val & THERM_STATUS_PROCHOT_LOG) + *proc_hot = true; + else + *proc_hot = false; + + *temp = (msr_val >> 16) & 0x7F; +} + +static void __maybe_unused throttle_active_work(struct work_struct *work) +{ + struct _thermal_state *state = container_of(to_delayed_work(work), + struct _thermal_state, therm_work); + unsigned int i, avg, this_cpu = smp_processor_id(); + u64 now = get_jiffies_64(); + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* temperature value is offset from the max so lesser means hotter */ + if (!hot && temp > state->baseline_temp) { + if (state->rate_control_active) + pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + + state->rate_control_active = false; + return; + } + + if (time_before64(now, state->next_check) && + state->rate_control_active) + goto re_arm; + + state->next_check = now + CHECK_INTERVAL; + + if (state->count != state->last_count) { + /* There was one new thermal interrupt */ + state->last_count = state->count; + state->average = 0; + state->sample_count = 0; + state->sample_index = 0; + } + + state->temp_samples[state->sample_index] = temp; + state->sample_count++; + state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); + if (state->sample_count < ARRAY_SIZE(state->temp_samples)) + goto re_arm; + + avg = 0; + for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) + avg += state->temp_samples[i]; + + avg /= ARRAY_SIZE(state->temp_samples); + + if (state->average > avg) { + pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + state->rate_control_active = true; + } + + state->average = avg; + +re_arm: + clear_therm_status_log(state->level); + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); +} + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -178,27 +341,33 @@ static void therm_throt_process(bool new_event, int event, int level) if (new_event) state->count++; - if (time_before64(now, state->next_check) && - state->count != state->last_count) + if (event != THERMAL_THROTTLING_EVENT) return; - state->next_check = now + CHECK_INTERVAL; - state->last_count = state->count; + if (new_event && !state->last_interrupt_time) { + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* + * Ignore short temperature spike as the system is not close + * to PROCHOT. 10C offset is large enough to ignore. It is + * already dropped from the high threshold temperature. + */ + if (temp > 10) + return; - /* if we just entered the thermal event */ - if (new_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - return; - } - if (old_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); - return; + state->baseline_temp = temp; + state->last_interrupt_time = now; + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); + } else if (old_event && state->last_interrupt_time) { + unsigned long throttle_time; + + throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); + if (throttle_time > state->max_time_ms) + state->max_time_ms = throttle_time; + state->total_time_ms += throttle_time; + state->last_interrupt_time = 0; } } @@ -244,20 +413,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_max_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_total_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } } + return 0; + +del_group: + sysfs_remove_group(&dev->kobj, &thermal_attr_group); + return err; } @@ -269,15 +465,34 @@ static void thermal_throttle_remove_dev(struct device *dev) /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_online(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + u32 l; + + state->package_throttle.level = PACKAGE_LEVEL; + state->core_throttle.level = CORE_LEVEL; + + INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); + INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + + /* Unmask the thermal vector after the above workqueues are initialized. */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); return thermal_throttle_add_dev(dev, cpu); } static int thermal_throttle_offline(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + cancel_delayed_work(&state->package_throttle.therm_work); + cancel_delayed_work(&state->core_throttle.therm_work); + + state->package_throttle.rate_control_active = false; + state->core_throttle.rate_control_active = false; + thermal_throttle_remove_dev(dev); return 0; } @@ -512,10 +727,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", tm2 ? "TM2" : "TM1"); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a0e52bd00ecc..3f6b137ef4e6 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev, dummy; + u32 rev, dummy __always_unused; mc = (struct microcode_amd *)amd_ucode_patch; @@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy; + u32 rev, dummy __always_unused; BUG_ON(raw_smp_processor_id() != cpu); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cb0fdcaf1415..7019d4b2df0c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); -/* - * Serialize late loading so that CPUs get updated one-by-one. - */ -static DEFINE_RAW_SPINLOCK(update_lock); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -566,11 +561,18 @@ static int __reload_late(void *info) if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) return -1; - raw_spin_lock(&update_lock); - apply_microcode_local(&err); - raw_spin_unlock(&update_lock); + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + apply_microcode_local(&err); + else + goto wait_for_siblings; - /* siblings return UCODE_OK because their engine got updated already */ if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); ret = -1; @@ -578,14 +580,18 @@ static int __reload_late(void *info) ret = 1; } +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + /* - * Increase the wait timeout to a safe value here since we're - * serializing the microcode update and that could take a while on a - * large number of CPUs. And that is fine as the *actual* timeout will - * be determined by the last CPU finished updating and thus cut short. + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. */ - if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) - panic("Timeout during microcode update!\n"); + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + apply_microcode_local(&err); return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ce799cfe9434..6a99535d7f37 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; struct microcode_intel *mc; enum ucode_state ret; static int prev_rev; @@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_ERROR; } - if (rev != prev_rev) { + if (bsp && rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", rev, mc->hdr.date & 0xffff, @@ -852,7 +853,7 @@ out: c->microcode = rev; /* Update boot_cpu_data's revision too, if we're on the BSP: */ - if (c->cpu_index == boot_cpu_data.cpu_index) + if (bsp) boot_cpu_data.microcode = rev; return ret; diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index aed45b8895d5..1db560ed2ca3 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -6,8 +6,7 @@ set -e -IN=$1 -OUT=$2 +OUT=$1 dump_array() { @@ -15,6 +14,7 @@ dump_array() SIZE=$2 PFX=$3 POSTFIX=$4 + IN=$5 PFX_SZ=$(echo $PFX | wc -c) TABS="$(printf '\t\t\t\t\t')" @@ -57,11 +57,18 @@ trap 'rm "$OUT"' EXIT echo "#endif" echo "" - dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" + dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2 echo "" - dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" + dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2 + echo "" + echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES" + echo "#ifndef _ASM_X86_VMXFEATURES_H" + echo "#include <asm/vmxfeatures.h>" + echo "#endif" + dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3 + echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */" ) > $OUT trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 062f77279ce3..caa032ce3fe3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -29,6 +29,7 @@ #include <asm/timer.h> #include <asm/reboot.h> #include <asm/nmi.h> +#include <clocksource/hyperv_timer.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -215,6 +216,10 @@ static void __init ms_hyperv_init_platform(void) int hv_host_info_ecx; int hv_host_info_edx; +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + /* * Extract the features and hints */ @@ -285,7 +290,12 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - mark_tsc_unstable("running on Hyper-V"); + if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } else { + mark_tsc_unstable("running on Hyper-V"); + } /* * Generation 2 instances don't support reading the NMI status from @@ -338,6 +348,15 @@ static void __init ms_hyperv_init_platform(void) x2apic_phys = 1; # endif + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); +#endif +} + +void hv_setup_sched_clock(void *sched_clock) +{ +#ifdef CONFIG_PARAVIRT + pv_ops.time.sched_clock = sched_clock; #endif } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index aa5c064a6a22..51b9190c628b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -15,7 +15,7 @@ #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/msr.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include "mtrr.h" diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 4d36dcc1cf87..a5c506f6da7f 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -101,9 +101,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) int length; size_t linelen; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - memset(line, 0, LINE_SIZE); len = min_t(size_t, len, LINE_SIZE - 1); @@ -226,8 +223,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_ADD_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 0); @@ -236,24 +231,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_SET_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_add(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_DEL_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 0); break; case MTRRIOC_KILL_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_KILL_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_del(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_ENTRY: @@ -279,8 +268,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_ADD_PAGE_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 1); @@ -289,8 +276,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_SET_PAGE_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_add_page(sentry.base, sentry.size, sentry.type, false); break; @@ -298,16 +283,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_DEL_PAGE_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 1); break; case MTRRIOC_KILL_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_KILL_PAGE_ENTRY: #endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; err = mtrr_del_page(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_PAGE_ENTRY: @@ -373,28 +354,6 @@ static int mtrr_close(struct inode *ino, struct file *file) return single_release(ino, file); } -static int mtrr_seq_show(struct seq_file *seq, void *offset); - -static int mtrr_open(struct inode *inode, struct file *file) -{ - if (!mtrr_if) - return -EIO; - if (!mtrr_if->get) - return -ENXIO; - return single_open(file, mtrr_seq_show, NULL); -} - -static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, -}; - static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; @@ -426,6 +385,29 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) return 0; } +static int mtrr_open(struct inode *inode, struct file *file) +{ + if (!mtrr_if) + return -EIO; + if (!mtrr_if->get) + return -ENXIO; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return single_open(file, mtrr_seq_show, NULL); +} + +static const struct proc_ops mtrr_proc_ops = { + .proc_open = mtrr_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = mtrr_write, + .proc_ioctl = mtrr_ioctl, +#ifdef CONFIG_COMPAT + .proc_compat_ioctl = mtrr_ioctl, +#endif + .proc_release = mtrr_close, +}; + static int __init mtrr_if_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -436,7 +418,7 @@ static int __init mtrr_if_init(void) (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) return -ENODEV; - proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops); return 0; } arch_initcall(mtrr_if_init); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 507039c20128..6a80f36b5d59 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -52,7 +52,7 @@ #include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include "mtrr.h" diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index cb2e49810d68..4eec8889b0ff 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -7,6 +7,10 @@ #include "cpu.h" +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +extern const char * const x86_vmx_flags[NVMXINTS*32]; +#endif + /* * Get CPU information for use by the procfs. */ @@ -102,6 +106,17 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) { + seq_puts(m, "\nvmx flags\t:"); + for (i = 0; i < 32*NVMXINTS; i++) { + if (test_bit(i, (unsigned long *)c->vmx_capability) && + x86_vmx_flags[i] != NULL) + seq_printf(m, " %s", x86_vmx_flags[i]); + } + } +#endif + seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 5c900f9527ff..c4be62058dd9 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup); #ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned long tmp; + unsigned int changed = 0; + unsigned long tmp, prev; int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) @@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } + + /* + * Stupid sanity-check whether RDRAND does *actually* generate + * some at least random-looking data. + */ + prev = tmp; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (rdrand_long(&tmp)) { + if (prev != tmp) + changed++; + + prev = tmp; + } + } + + if (WARN_ON_ONCE(!changed)) + pr_emerg( +"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); + } #endif diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 03eb90d00af0..89049b343c7a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -618,7 +618,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); list_del(&d->list); - if (is_mbm_enabled()) + if (r->mon_capable && is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { /* diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e49b77283924..181c992f448c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -57,6 +57,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); /** * struct mon_evt - Entry in the event list of a resource diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 397206f23d14..773124b0e18a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -514,7 +514,7 @@ void mbm_handle_overflow(struct work_struct *work) mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_enable_key)) + if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); @@ -543,7 +543,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_enable_key)) + if (!static_branch_likely(&rdt_mon_enable_key)) return; cpu = cpumask_any(&dom->cpu_mask); dom->mbm_work_cpu = cpu; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..064e9ef44cd6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -534,11 +532,15 @@ static void move_myself(struct callback_head *head) kfree(rdtgrp); } + if (unlikely(current->flags & PF_EXITING)) + goto out; + preempt_disable(); /* update PQR_ASSOC MSR to make resource group go into effect */ resctrl_sched_in(); preempt_enable(); +out: kfree(callback); } @@ -727,6 +729,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +#ifdef CONFIG_PROC_CPU_RESCTRL + +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!static_branch_unlikely(&rdt_enable_key)) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (rdtg->closid != tsk->closid) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdtg->kn->name); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (tsk->rmid != crg->mon.rmid) + continue; + seq_printf(s, "%s", crg->kn->name); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + static int rdt_last_cmd_status_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -1743,9 +1831,6 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -1753,6 +1838,9 @@ static int set_cache_qos_cfg(int level, bool enable) else return -EINVAL; + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + r_l = &rdt_resources_all[level]; list_for_each_entry(d, &r_l->domains, list) { /* Pick one CPU from each domain instance to update MSR */ @@ -1972,7 +2060,7 @@ static int rdt_get_tree(struct fs_context *fc) if (rdt_mon_capable) { ret = mongroup_create_dir(rdtgroup_default.kn, - NULL, "mon_groups", + &rdtgroup_default, "mon_groups", &kn_mongrp); if (ret < 0) goto out_info; @@ -2039,25 +2127,20 @@ enum rdt_param { nr__rdt_params }; -static const struct fs_parameter_spec rdt_param_specs[] = { +static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), {} }; -static const struct fs_parameter_description rdt_fs_parameters = { - .name = "rdt", - .specs = rdt_param_specs, -}; - static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct rdt_fs_context *ctx = rdt_fc2context(fc); struct fs_parse_result result; int opt; - opt = fs_parse(fc, &rdt_fs_parameters, param, &result); + opt = fs_parse(fc, rdt_fs_parameters, param, &result); if (opt < 0) return opt; @@ -2207,7 +2290,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { free_rmid(sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); - kfree(sentry); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + kfree(sentry); } } @@ -2245,7 +2332,11 @@ static void rmdir_all_sub(void) kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); - kfree(rdtgrp); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ update_closid_rmid(cpu_online_mask, &rdtgroup_default); @@ -2282,7 +2373,7 @@ static void rdt_kill_sb(struct super_block *sb) static struct file_system_type rdt_fs_type = { .name = "resctrl", .init_fs_context = rdt_init_fs_context, - .parameters = &rdt_fs_parameters, + .parameters = rdt_fs_parameters, .kill_sb = rdt_kill_sb, }; @@ -2448,7 +2539,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, /* * Create the mon_data directory first. */ - ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); if (ret) return ret; @@ -2638,7 +2729,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) { @@ -2647,11 +2737,9 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, uint files = 0; int ret; - prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); + prdtgrp = rdtgroup_kn_lock_live(parent_kn); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } @@ -2722,7 +2810,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kernfs_activate(kn); /* - * The caller unlocks the prgrp_kn upon success. + * The caller unlocks the parent_kn upon success. */ return 0; @@ -2733,7 +2821,7 @@ out_destroy: out_free_rgrp: kfree(rdtgrp); out_unlock: - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -2750,15 +2838,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) * to monitor a subset of tasks and cpus in its parent ctrl_mon group. */ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, - const char *name, - umode_t mode) + const char *name, umode_t mode) { struct rdtgroup *rdtgrp, *prgrp; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); if (ret) return ret; @@ -2771,7 +2856,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -2780,7 +2865,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, * to allocate and monitor resources. */ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode) { struct rdtgroup *rdtgrp; @@ -2788,8 +2872,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, u32 closid; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); if (ret) return ret; @@ -2814,7 +2897,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. */ - ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); goto out_del_list; @@ -2830,7 +2913,7 @@ out_id_free: out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); out_unlock: - rdtgroup_kn_unlock(prgrp_kn); + rdtgroup_kn_unlock(parent_kn); return ret; } @@ -2863,14 +2946,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * subdirectory */ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ if (rdt_mon_capable && is_mon_groups(parent_kn, name)) - return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; } @@ -2956,13 +3039,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); + rdtgroup_ctrl_remove(kn, rdtgrp); + /* * Free all the child monitor group rmids. */ free_all_child_rdtgrp(rdtgrp); - rdtgroup_ctrl_remove(kn, rdtgrp); - return 0; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index adf9b71386ef..62b137c3c97a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -4,7 +4,7 @@ */ #include <linux/cpu.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/apic.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index ee48c3fc8a65..d3a0791bc052 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -7,7 +7,7 @@ #include <linux/cpu.h> #include <asm/apic.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/processor.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..e2ad30e474f8 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +#undef pr_fmt +#define pr_fmt(fmt) "tsx: " fmt + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the RTM and HLE CPUID + * bits. Clear them here since they are now expected to be not + * set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the RTM and HLE CPUID + * bits. Force them here since they are now expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + setup_force_cpu_cap(X86_FEATURE_HLE); + } +} diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 32b4dc9030aa..c222f283b456 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -17,6 +17,12 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); +u32 get_umwait_control_msr(void) +{ + return umwait_control_cached; +} +EXPORT_SYMBOL_GPL(get_umwait_control_msr); + /* * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 3c648476d4fb..46d732696c1c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,34 +30,69 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> +#include <asm/vmware.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt -#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_FEATURES_LEAF 0x40000010 +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 -#define VMWARE_HYPERVISOR_PORT 0x5658 -#define VMWARE_PORT_CMD_GETVERSION 10 -#define VMWARE_PORT_CMD_GETHZ 45 -#define VMWARE_PORT_CMD_GETVCPU_INFO 68 -#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 -#define VMWARE_PORT_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_LEGACY_X2APIC 3 +#define VMWARE_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx)" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "0"(VMWARE_HYPERVISOR_MAGIC), \ - "1"(VMWARE_PORT_CMD_##cmd), \ - "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ - "memory"); + __asm__("inl (%%dx), %%eax" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ + switch (vmware_hypercall_mode) { \ + case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ + VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ + VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + default: \ + VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ + break; \ + } \ + } while (0) static unsigned long vmware_tsc_khz __ro_after_init; +static u8 vmware_hypercall_mode __ro_after_init; static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } @@ -129,6 +164,10 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) + setup_force_cpu_cap(X86_FEATURE_VMCALL); + else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) + setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL); } static void __init vmware_platform_setup(void) @@ -136,7 +175,7 @@ static void __init vmware_platform_setup(void) uint32_t eax, ebx, ecx, edx; uint64_t lpj, tsc_khz; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); @@ -174,10 +213,21 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } +static u8 vmware_select_hypercall(void) +{ + int eax, ebx, ecx, edx; + + cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx); + return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL | + CPUID_VMWARE_FEATURES_ECX_VMCALL)); +} + /* * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. + * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode + * intentionally defaults to 0. */ static uint32_t __init vmware_platform(void) { @@ -187,8 +237,16 @@ static uint32_t __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); - if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) { + if (eax >= CPUID_VMWARE_FEATURES_LEAF) + vmware_hypercall_mode = + vmware_select_hypercall(); + + pr_info("hypercall mode: 0x%02x\n", + (unsigned int) vmware_hypercall_mode); + return CPUID_VMWARE_INFO_LEAF; + } } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) return 1; @@ -200,9 +258,9 @@ static uint32_t __init vmware_platform(void) static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; + VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; } const __initconst struct hypervisor_x86 x86_hyper_vmware = { diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 8e6f2f4b4afe..df1358ba622b 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -16,13 +16,6 @@ #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ -#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 -#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 -#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 -#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 -#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 -#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 - static void init_zhaoxin_cap(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -58,8 +51,6 @@ static void init_zhaoxin_cap(struct cpuinfo_x86 *c) if (c->x86 >= 0x6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - cpu_detect_cache_sizes(c); } static void early_init_zhaoxin(struct cpuinfo_x86 *c) @@ -89,31 +80,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) } -static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c) -{ - u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; - - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); - msr_ctl = vmx_msr_high | vmx_msr_low; - - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) - set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) - set_cpu_cap(c, X86_FEATURE_VNMI); - if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - vmx_msr_low, vmx_msr_high); - msr_ctl2 = vmx_msr_high | vmx_msr_low; - if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && - (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) - set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) - set_cpu_cap(c, X86_FEATURE_EPT); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) - set_cpu_cap(c, X86_FEATURE_VPID); - } -} - static void init_zhaoxin(struct cpuinfo_x86 *c) { early_init_zhaoxin(c); @@ -141,8 +107,7 @@ static void init_zhaoxin(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif - if (cpu_has(c, X86_FEATURE_VMX)) - zhaoxin_detect_vmx_virtcap(c); + init_ia32_feat_ctl(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2bf70a2fed90..fd87b59452a3 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/memblock.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -39,6 +40,7 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> +#include <asm/cmdline.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } +/* + * When the crashkernel option is specified, only use the low + * 1M for the real mode trampoline. + */ +void __init crash_reserve_low_1M(void) +{ + if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) + return; + + memblock_reserve(0, 1<<20); + pr_info("Reserving the low 1M of memory for crashkernel\n"); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_KEXEC_FILE -static unsigned long crash_zero_bytes; - static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) unsigned int nr_ranges = 0; struct crash_mem *cmem; - walk_system_ram_res(0, -1, &nr_ranges, - get_nr_ram_ranges_callback); + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); if (!nr_ranges) return NULL; @@ -217,17 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, 1<<20); + if (ret) + return ret; + /* Exclude crashkernel region */ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; - if (crashk_low_res.end) { + if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, - crashk_low_res.end); - if (ret) - return ret; - } + crashk_low_res.end); return ret; } @@ -248,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { struct crash_mem *cmem; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - int ret, i; + int ret; cmem = fill_up_crash_elf_data(); if (!cmem) return -ENOMEM; - ret = walk_system_ram_res(0, -1, cmem, - prepare_elf64_ram_headers_callback); + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); if (ret) goto out; @@ -267,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr, goto out; /* By default prepare 64bit headers */ - ret = crash_prepare_elf64_headers(cmem, - IS_ENABLED(CONFIG_X86_64), addr, sz); - if (ret) - goto out; + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); - /* - * If a range matches backup region, adjust offset to backup - * segment. - */ - ehdr = (Elf64_Ehdr *)*addr; - phdr = (Elf64_Phdr *)(ehdr + 1); - for (i = 0; i < ehdr->e_phnum; phdr++, i++) - if (phdr->p_type == PT_LOAD && - phdr->p_paddr == image->arch.backup_src_start && - phdr->p_memsz == image->arch.backup_src_sz) { - phdr->p_offset = image->arch.backup_load_addr; - break; - } out: vfree(cmem); return ret; @@ -298,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_table[nr_e820_entries], entry, - sizeof(struct e820_entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -323,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; - int ret = 0; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; cmem->nr_ranges = 1; - /* Exclude Backup region */ - start = image->arch.backup_load_addr; - end = start + image->arch.backup_src_sz - 1; - ret = crash_exclude_mem_range(cmem, start, end); - if (ret) - return ret; - /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; @@ -358,40 +344,39 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; - /* Add first 640K segment */ - ei.addr = image->arch.backup_src_start; - ei.size = image->arch.backup_src_sz; - ei.type = E820_TYPE_RAM; - add_e820_entry(params, &ei); + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); /* Add ACPI tables */ cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add e820 reserved ranges */ cmd.type = E820_TYPE_RESERVED; flags = IORESOURCE_MEM; walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add crashk_low_res region */ if (crashk_low_res.end) { ei.addr = crashk_low_res.start; - ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.size = resource_size(&crashk_low_res); ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } /* Exclude some ranges from crashk_res and add rest to memmap */ - ret = memmap_exclude_ranges(image, cmem, crashk_res.start, - crashk_res.end); + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); if (ret) goto out; @@ -411,55 +396,12 @@ out: return ret; } -static int determine_backup_region(struct resource *res, void *arg) -{ - struct kimage *image = arg; - - image->arch.backup_src_start = res->start; - image->arch.backup_src_sz = resource_size(res); - - /* Expecting only one range for backup region */ - return 1; -} - int crash_load_segments(struct kimage *image) { int ret; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; - /* - * Determine and load a segment for backup area. First 640K RAM - * region is backup source - */ - - ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, - image, determine_backup_region); - - /* Zero or postive return values are ok */ - if (ret < 0) - return ret; - - /* Add backup segment. */ - if (image->arch.backup_src_sz) { - kbuf.buffer = &crash_zero_bytes; - kbuf.bufsz = sizeof(crash_zero_bytes); - kbuf.memsz = image->arch.backup_src_sz; - kbuf.buf_align = PAGE_SIZE; - /* - * Ideally there is no source for backup segment. This is - * copied in purgatory after crash. Just add a zero filled - * segment for now to make sure checksum logic works fine. - */ - ret = kexec_add_buffer(&kbuf); - if (ret) - return ret; - image->arch.backup_load_addr = kbuf.mem; - pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, - image->arch.backup_src_start, kbuf.memsz); - } - /* Prepare elf headers and add a segment */ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c new file mode 100644 index 000000000000..c0159a7bca6d --- /dev/null +++ b/arch/x86/kernel/crash_core_32.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/crash_core.h> + +#include <asm/pgtable.h> +#include <asm/setup.h> + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X86_PAE + VMCOREINFO_CONFIG(X86_PAE); +#endif +} diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c new file mode 100644 index 000000000000..845a57eb4eb7 --- /dev/null +++ b/arch/x86/kernel/crash_core_64.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/crash_core.h> + +#include <asm/pgtable.h> +#include <asm/setup.h> + +void arch_crash_save_vmcoreinfo(void) +{ + u64 sme_mask = sme_me_mask; + + VMCOREINFO_NUMBER(phys_base); + VMCOREINFO_SYMBOL(init_top_pgt); + vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", + pgtable_l5_enabled()); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_NUMBER(sme_mask); +} diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 22369dd5de3b..045e82e8945b 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, { return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); } + +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0, sev_active()); +} diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c deleted file mode 100644 index 0b8cedb20d6d..000000000000 --- a/arch/x86/kernel/doublefault.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/sched/debug.h> -#include <linux/init_task.h> -#include <linux/fs.h> - -#include <linux/uaccess.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> - -#ifdef CONFIG_X86_32 - -#define DOUBLEFAULT_STACKSIZE (1024) -static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; -#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) - -#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) - -static void doublefault_fn(void) -{ - struct desc_ptr gdt_desc = {0, 0}; - unsigned long gdt, tss; - - native_store_gdt(&gdt_desc); - gdt = gdt_desc.address; - - printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); - - if (ptr_ok(gdt)) { - gdt += GDT_ENTRY_TSS << 3; - tss = get_desc_base((struct desc_struct *)gdt); - printk(KERN_EMERG "double fault, tss at %08lx\n", tss); - - if (ptr_ok(tss)) { - struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", - t->ip, t->sp); - - printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->ax, t->bx, t->cx, t->dx); - printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->si, t->di); - } - } - - for (;;) - cpu_relax(); -} - -struct x86_hw_tss doublefault_tss __cacheline_aligned = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), -}; - -/* dummy for do_double_fault() call */ -void df_debug(struct pt_regs *regs, long error_code) {} - -#else /* !CONFIG_X86_32 */ - -void df_debug(struct pt_regs *regs, long error_code) -{ - pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); - show_regs(regs); - panic("Machine halted."); -} -#endif diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000000..3793646f0fb5 --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/init_task.h> +#include <linux/fs.h> + +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/traps.h> + +extern void double_fault(void); +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) + +static void set_df_gdt_entry(unsigned int cpu); + +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage notrace void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; + + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + + cr2 = native_read_cr2(); + + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); + + trace_hardirqs_off(); + + /* + * Fill in pt_regs. A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from do_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. */ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + do_double_fault(®s, 0, cr2); + + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + * + * Instead, don't bother. We can return through + * rewind_stack_do_exit() instead. + */ + panic("cannot return from double fault\n"); +} +NOKPROBE_SYMBOL(doublefault_shim); + +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. + */ + .ldt = 0, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + + .ip = (unsigned long) double_fault, + .flags = X86_EFLAGS_FIXED, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif + + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, +}; + +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + set_df_gdt_entry(cpu); +} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2b5886401e5f..ae64ec7f752f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -365,21 +365,30 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) } NOKPROBE_SYMBOL(oops_end); -int __die(const char *str, struct pt_regs *regs, long err) +static void __die_header(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); +} +NOKPROBE_SYMBOL(__die_header); +static int __die_body(const char *str, struct pt_regs *regs, long err) +{ show_regs(regs); print_modules(); @@ -389,6 +398,13 @@ int __die(const char *str, struct pt_regs *regs, long err) return 0; } +NOKPROBE_SYMBOL(__die_body); + +int __die(const char *str, struct pt_regs *regs, long err) +{ + __die_header(str, regs, err); + return __die_body(str, regs, err); +} NOKPROBE_SYMBOL(__die); /* @@ -405,6 +421,19 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } +void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + __die_header(str, regs, err); + if (gp_addr) + kasan_non_canonical_hook(gp_addr); + if (__die_body(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + void show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 64a59d726639..8e3a8fedfa4d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_ENTRY) return "ENTRY_TRAMPOLINE"; + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + return NULL; } @@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) return true; } +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +#else + return false; +#endif +} + + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { @@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_softirq_stack(stack, info)) goto recursion_check; + if (in_doublefault_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 753b8cfe8b8a..87b97897a881 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb..c5399e80c59c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -999,6 +1000,17 @@ void __init e820__reserve_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + e820__range_update(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -1037,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1052,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1064,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1078,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1285,6 +1301,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6c4f01540833..2f9ec14be3b1 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -549,6 +549,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CNL_IDS(&gen9_early_ops), INTEL_ICL_11_IDS(&gen11_early_ops), INTEL_EHL_IDS(&gen11_early_ops), + INTEL_TGL_12_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -709,6 +710,12 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3e20, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x8a12, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 0071b794ed19..400a05e1c1c5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -352,6 +352,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); return 0; } + fpregs_deactivate(fpu); fpregs_unlock(); } @@ -403,6 +404,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } if (!ret) fpregs_mark_activate(); + else + fpregs_deactivate(fpu); fpregs_unlock(); err_out: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e5cb67d67c03..a1806598aaa4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -107,23 +107,20 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); -static int xfeature_is_supervisor(int xfeature_nr) +static bool xfeature_is_supervisor(int xfeature_nr) { /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component 'i' is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. + * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) + * returns ECX[0] set to (1) for a supervisor state, and cleared (0) + * for a user state. */ u32 eax, ebx, ecx, edx; cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return !!(ecx & 1); + return ecx & 1; } -static int xfeature_is_user(int xfeature_nr) +static bool xfeature_is_user(int xfeature_nr) { return !xfeature_is_supervisor(xfeature_nr); } @@ -254,10 +251,13 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) @@ -342,7 +342,7 @@ static int xfeature_is_aligned(int xfeature_nr) */ static void __init setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + unsigned int xstate_comp_sizes[XFEATURE_MAX]; int i; /* @@ -350,8 +350,9 @@ static void __init setup_xstate_comp(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { @@ -415,7 +416,8 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask; /* * Init all the features state with header.xfeatures being 0x0 @@ -840,7 +842,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is + * have not enabled. Remember that xfeatures_mask is * what we write to the XCR0 register. */ WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..37a0aeaf89e7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/module.h> #include <linux/memory.h> +#include <linux/vmalloc.h> #include <trace/syscall.h> @@ -34,6 +35,8 @@ #ifdef CONFIG_DYNAMIC_FTRACE +static int ftrace_poke_late = 0; + int ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { @@ -43,84 +46,37 @@ int ftrace_arch_code_modify_prepare(void) * ftrace has it set to "read/write". */ mutex_lock(&text_mutex); - set_kernel_text_rw(); - set_all_modules_text_rw(); + ftrace_poke_late = 1; return 0; } int ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { - set_all_modules_text_ro(); - set_kernel_text_ro(); + /* + * ftrace_make_{call,nop}() may be called during + * module load, and we need to finish the text_poke_queue() + * that they do, here. + */ + text_poke_finish(); + ftrace_poke_late = 0; mutex_unlock(&text_mutex); return 0; } -union ftrace_code_union { - char code[MCOUNT_INSN_SIZE]; - struct { - unsigned char op; - int offset; - } __attribute__((packed)); -}; - -static int ftrace_calc_offset(long ip, long addr) -{ - return (int)(addr - ip); -} - -static unsigned char * -ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr) +static const char *ftrace_nop_replace(void) { - static union ftrace_code_union calc; - - calc.op = op; - calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); - - return calc.code; -} - -static unsigned char * -ftrace_call_replace(unsigned long ip, unsigned long addr) -{ - return ftrace_text_replace(0xe8, ip, addr); -} - -static inline int -within(unsigned long addr, unsigned long start, unsigned long end) -{ - return addr >= start && addr < end; + return ideal_nops[NOP_ATOMIC5]; } -static unsigned long text_ip_addr(unsigned long ip) +static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { - /* - * On x86_64, kernel text mappings are mapped read-only, so we use - * the kernel identity mapping instead of the kernel text mapping - * to modify the kernel text. - * - * For 32bit kernels, these mappings are same and we can use - * kernel identity mapping to modify code. - */ - if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa_symbol(ip)); - - return ip; + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } -static const unsigned char *ftrace_nop_replace(void) +static int ftrace_verify_code(unsigned long ip, const char *old_code) { - return ideal_nops[NOP_ATOMIC5]; -} - -static int -ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) -{ - unsigned char replaced[MCOUNT_INSN_SIZE]; - - ftrace_expected = old_code; + char cur_code[MCOUNT_INSN_SIZE]; /* * Note: @@ -129,31 +85,46 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, * Carefully read and modify the code with probe_kernel_*(), and make * sure what we read is what we expected it to be before modifying it. */ - /* read the text we want to modify */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + WARN_ON(1); return -EFAULT; + } /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + WARN_ON(1); return -EINVAL; + } - ip = text_ip_addr(ip); - - /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) - return -EPERM; + return 0; +} - sync_core(); +/* + * Marked __ref because it calls text_poke_early() which is .init.text. That is + * ok because that call will happen early, during boot, when .init sections are + * still present. + */ +static int __ref +ftrace_modify_code_direct(unsigned long ip, const char *old_code, + const char *new_code) +{ + int ret = ftrace_verify_code(ip, old_code); + if (ret) + return ret; + /* replace the text with the new text */ + if (ftrace_poke_late) + text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } -int ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); @@ -167,19 +138,20 @@ int ftrace_make_nop(struct module *mod, * just modify the code directly. */ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_code_direct(ip, old, new); - ftrace_expected = NULL; - - /* Normal cases use add_brk_on_nop */ + /* + * x86 overrides ftrace_replace_code -- this function will never be used + * in this case. + */ WARN_ONCE(1, "invalid use of ftrace_make_nop"); return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); @@ -189,43 +161,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* - * The modifying_ftrace_code is used to tell the breakpoint - * handler to call ftrace_int3_handler(). If it fails to - * call this handler for a breakpoint added by ftrace, then - * the kernel may crash. - * - * As atomic_writes on x86 do not need a barrier, we do not - * need to add smp_mb()s for this to work. It is also considered - * that we can not read the modifying_ftrace_code before - * executing the breakpoint. That would be quite remarkable if - * it could do that. Here's the flow that is required: - * - * CPU-0 CPU-1 - * - * atomic_inc(mfc); - * write int3s - * <trap-int3> // implicit (r)mb - * if (atomic_read(mfc)) - * call ftrace_int3_handler() - * - * Then when we are finished: - * - * atomic_dec(mfc); - * - * If we hit a breakpoint that was not set by ftrace, it does not - * matter if ftrace_int3_handler() is called or not. It will - * simply be ignored. But it is crucial that a ftrace nop/caller - * breakpoint is handled. No other user should ever place a - * breakpoint on an ftrace nop/caller location. It must only - * be done by this code. - */ -atomic_t modifying_ftrace_code __read_mostly; - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code); - -/* * Should never be called: * As it is only called by __ftrace_replace_code() which is called by * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() @@ -237,452 +172,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { WARN_ON(1); - ftrace_expected = NULL; return -EINVAL; } -static unsigned long ftrace_update_func; -static unsigned long ftrace_update_func_call; - -static int update_ftrace_func(unsigned long ip, void *new) -{ - unsigned char old[MCOUNT_INSN_SIZE]; - int ret; - - memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); - - ftrace_update_func = ip; - /* Make sure the breakpoints see the ftrace_update_func update */ - smp_wmb(); - - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - - ret = ftrace_modify_code(ip, old, new); - - atomic_dec(&modifying_ftrace_code); - - return ret; -} - int ftrace_update_ftrace_func(ftrace_func_t func) { - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char *new; - int ret; - - ftrace_update_func_call = (unsigned long)func; - - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - - /* Also update the regs callback function */ - if (!ret) { - ip = (unsigned long)(&ftrace_regs_call); - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - } - - return ret; -} - -static nokprobe_inline int is_ftrace_caller(unsigned long ip) -{ - if (ip == ftrace_update_func) - return 1; - - return 0; -} - -/* - * A breakpoint was added to the code address we are about to - * modify, and this is the handle that will just skip over it. - * We are either changing a nop into a trace call, or a trace - * call to a nop. While the change is taking place, we treat - * it just like it was a nop. - */ -int ftrace_int3_handler(struct pt_regs *regs) -{ unsigned long ip; + const char *new; - if (WARN_ON_ONCE(!regs)) - return 0; - - ip = regs->ip - INT3_INSN_SIZE; - - if (ftrace_location(ip)) { - int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); - return 1; - } else if (is_ftrace_caller(ip)) { - if (!ftrace_update_func_call) { - int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); - return 1; - } - int3_emulate_call(regs, ftrace_update_func_call); - return 1; - } - - return 0; -} -NOKPROBE_SYMBOL(ftrace_int3_handler); - -static int ftrace_write(unsigned long ip, const char *val, int size) -{ - ip = text_ip_addr(ip); - - if (probe_kernel_write((void *)ip, val, size)) - return -EPERM; - - return 0; -} - -static int add_break(unsigned long ip, const char *old) -{ - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - ftrace_expected = old; - - /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) - return -EINVAL; - - return ftrace_write(ip, &brk, 1); -} - -static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned const char *old; - unsigned long ip = rec->ip; - - old = ftrace_call_replace(ip, addr); - - return add_break(rec->ip, old); -} - - -static int add_brk_on_nop(struct dyn_ftrace *rec) -{ - unsigned const char *old; - - old = ftrace_nop_replace(); - - return add_break(rec->ip, old); -} - -static int add_breakpoints(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ftrace_addr = ftrace_get_addr_curr(rec); - - ret = ftrace_test_record(rec, enable); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_brk_on_nop(rec); - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_brk_on_call(rec, ftrace_addr); - } - return 0; -} - -/* - * On error, we need to remove breakpoints. This needs to - * be done caefully. If the address does not currently have a - * breakpoint, we know we are done. Otherwise, we look at the - * remaining 4 bytes of the instruction. If it matches a nop - * we replace the breakpoint with the nop. Otherwise we replace - * it with the call instruction. - */ -static int remove_breakpoint(struct dyn_ftrace *rec) -{ - unsigned char ins[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - const unsigned char *nop; - unsigned long ftrace_addr; - unsigned long ip = rec->ip; - - /* If we fail the read, just give up */ - if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* If this does not have a breakpoint, we are done */ - if (ins[0] != brk) - return 0; - - nop = ftrace_nop_replace(); - - /* - * If the last 4 bytes of the instruction do not match - * a nop, then we assume that this is a call to ftrace_addr. - */ - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { - /* - * For extra paranoidism, we check if the breakpoint is on - * a call that would actually jump to the ftrace_addr. - * If not, don't touch the breakpoint, we make just create - * a disaster. - */ - ftrace_addr = ftrace_get_addr_new(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) - goto update; - - /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = ftrace_get_addr_curr(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - ftrace_expected = nop; - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) - return -EINVAL; - } - - update: - return ftrace_write(ip, nop, 1); -} - -static int add_update_code(unsigned long ip, unsigned const char *new) -{ - /* skip breakpoint */ - ip++; - new++; - return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); -} - -static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - return add_update_code(ip, new); -} - -static int add_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - return add_update_code(ip, new); -} - -static int add_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_test_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_update_call(rec, ftrace_addr); - - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_update_nop(rec); - } - - return 0; -} - -static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - - return ftrace_write(ip, new, 1); -} - -static int finish_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - - return ftrace_write(ip, new, 1); -} - -static int finish_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_update_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return finish_update_call(rec, ftrace_addr); + ip = (unsigned long)(&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return finish_update_nop(rec); - } + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } -static void do_sync_core(void *data) -{ - sync_core(); -} - -static void run_sync(void) -{ - int enable_irqs; - - /* No need to sync if there's only one CPU */ - if (num_online_cpus() == 1) - return; - - enable_irqs = irqs_disabled(); - - /* We may be called with interrupts disabled (on bootup). */ - if (enable_irqs) - local_irq_enable(); - on_each_cpu(do_sync_core, NULL, 1); - if (enable_irqs) - local_irq_disable(); -} - void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; - const char *report = "adding breakpoints"; - int count = 0; + const char *new, *old; int ret; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = add_breakpoints(rec, enable); - if (ret) - goto remove_breakpoints; - count++; - } - - run_sync(); + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; - report = "updating code"; - count = 0; + case FTRACE_UPDATE_MAKE_CALL: + old = ftrace_nop_replace(); + break; - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec)); + break; + } - ret = add_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; + ret = ftrace_verify_code(rec->ip, old); + if (ret) { + ftrace_bug(ret, rec); + return; + } } - run_sync(); - - report = "removing breakpoints"; - count = 0; - for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = finish_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; - } + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; - run_sync(); + case FTRACE_UPDATE_MAKE_CALL: + case FTRACE_UPDATE_MODIFY_CALL: + new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec)); + break; - return; + case FTRACE_UPDATE_MAKE_NOP: + new = ftrace_nop_replace(); + break; + } - remove_breakpoints: - pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec); - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); - /* - * Breakpoints are handled only when this function is in - * progress. The system could not work with them. - */ - if (remove_breakpoint(rec)) - BUG(); + text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + ftrace_update_record(rec, enable); } - run_sync(); -} - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) -{ - int ret; - - ret = add_break(ip, old_code); - if (ret) - goto out; - - run_sync(); - - ret = add_update_code(ip, new_code); - if (ret) - goto fail_update; - - run_sync(); - - ret = ftrace_write(ip, new_code, 1); - /* - * The breakpoint is handled only when this function is in progress. - * The system could not work if we could not remove it. - */ - BUG_ON(ret); - out: - run_sync(); - return ret; - - fail_update: - /* Also here the system could not work with the breakpoint */ - if (ftrace_write(ip, old_code, 1)) - BUG(); - goto out; + text_poke_finish(); } void arch_ftrace_update_code(int command) { - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - ftrace_modify_all_code(command); - - atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void) @@ -747,6 +314,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long start_offset; unsigned long end_offset; unsigned long op_offset; + unsigned long call_offset; unsigned long offset; unsigned long npages; unsigned long size; @@ -763,10 +331,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) start_offset = (unsigned long)ftrace_regs_caller; end_offset = (unsigned long)ftrace_regs_caller_end; op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; + call_offset = (unsigned long)ftrace_call; } size = end_offset - start_offset; @@ -823,16 +393,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the new offset to the ftrace_ops */ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + /* put in the call to the function */ + mutex_lock(&text_mutex); + call_offset -= start_offset; + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, + trampoline + call_offset, + ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + mutex_unlock(&text_mutex); + /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; set_vm_flush_reset_perms(trampoline); - /* - * Module allocation needs to be completed by making the page - * executable. The page is still writable, which is a security hazard, - * but anyhow ftrace breaks W^X completely. - */ + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -859,62 +434,54 @@ static unsigned long calc_trampoline_call_offset(bool save_regs) void arch_ftrace_update_trampoline(struct ftrace_ops *ops) { ftrace_func_t func; - unsigned char *new; unsigned long offset; unsigned long ip; unsigned int size; - int ret, npages; + const char *new; - if (ops->trampoline) { - /* - * The ftrace_ops caller may set up its own trampoline. - * In such a case, this code must not modify it. - */ - if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) - return; - npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; - set_memory_rw(ops->trampoline, npages); - } else { + if (!ops->trampoline) { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; - npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + return; } + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); ip = ops->trampoline + offset; - func = ftrace_ops_get_func(ops); - ftrace_update_func_call = (unsigned long)func; - + mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - set_memory_ro(ops->trampoline, npages); - - /* The update should never fail */ - WARN_ON(ret); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + mutex_unlock(&text_mutex); } /* Return the address of the function the trampoline calls */ static void *addr_from_call(void *ptr) { - union ftrace_code_union calc; + union text_poke_insn call; int ret; - ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE); if (WARN_ON_ONCE(ret < 0)) return NULL; /* Make sure this is a call */ - if (WARN_ON_ONCE(calc.op != 0xe8)) { - pr_warn("Expected e8, got %x\n", calc.op); + if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) { + pr_warn("Expected E8, got %x\n", call.opcode); return NULL; } - return ptr + MCOUNT_INSN_SIZE + calc.offset; + return ptr + CALL_INSN_SIZE + call.disp; } void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, @@ -981,19 +548,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE extern void ftrace_graph_call(void); -static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { - return ftrace_text_replace(0xe9, ip, addr); + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); } static int ftrace_mod_jmp(unsigned long ip, void *func) { - unsigned char *new; + const char *new; - ftrace_update_func_call = 0UL; new = ftrace_jmp_replace(ip, (unsigned long)func); - - return update_ftrace_func(ip, new); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + return 0; } int ftrace_enable_ftrace_graph_caller(void) @@ -1019,10 +585,9 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { + unsigned long return_hooker = (unsigned long)&return_to_handler; unsigned long old; int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; /* * When resuming from suspend-to-ram, this function can be indirectly diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 073aab525d80..e8a9f8370112 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -12,20 +12,18 @@ #include <asm/frame.h> #include <asm/asm-offsets.h> -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER # define MCOUNT_FRAME 1 /* using frame = true */ #else # define MCOUNT_FRAME 0 /* using frame = false */ #endif -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) ret -END(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_CODE_START(ftrace_caller) #ifdef CONFIG_FRAME_POINTER /* @@ -85,11 +83,11 @@ ftrace_graph_call: #endif /* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) ret -END(ftrace_caller) +SYM_CODE_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_CODE_START(ftrace_regs_caller) /* * We're here from an mcount/fentry CALL, and the stack frame looks like: * @@ -138,7 +136,7 @@ ENTRY(ftrace_regs_caller) movl function_trace_op, %ecx # 3rd argument: ftrace_pos pushl %esp # 4th argument: pt_regs -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub addl $4, %esp # skip 4th argument @@ -163,9 +161,10 @@ GLOBAL(ftrace_regs_call) popl %eax jmp .Lftrace_ret +SYM_CODE_END(ftrace_regs_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_CODE_START(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx @@ -179,7 +178,7 @@ ENTRY(ftrace_graph_caller) popl %ecx popl %eax ret -END(ftrace_graph_caller) +SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 809d54397dba..369e61faacfe 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -14,9 +14,6 @@ .code64 .section .entry.text, "ax" -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ # define MCOUNT_FRAME_SIZE (8+16*2) @@ -88,6 +85,7 @@ EXPORT_SYMBOL(__fentry__) movq %rdi, RDI(%rsp) movq %r8, R8(%rsp) movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) /* * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. @@ -114,7 +112,11 @@ EXPORT_SYMBOL(__fentry__) subq $MCOUNT_INSN_SIZE, %rdi .endm -.macro restore_mcount_regs +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + movq R9(%rsp), %r9 movq R8(%rsp), %r8 movq RDI(%rsp), %rdi @@ -123,31 +125,29 @@ EXPORT_SYMBOL(__fentry__) movq RCX(%rsp), %rcx movq RAX(%rsp), %rax - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp + addq $MCOUNT_REG_SIZE-\save, %rsp .endm #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) retq -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs -GLOBAL(ftrace_caller_op_ptr) +SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx -GLOBAL(ftrace_call) +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub restore_mcount_regs @@ -157,10 +157,10 @@ GLOBAL(ftrace_call) * think twice before adding any new code or changing the * layout here. */ -GLOBAL(ftrace_epilogue) +SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub #endif @@ -168,19 +168,21 @@ GLOBAL(ftrace_graph_call) * This is weak to keep gas from relaxing the jumps. * It is also used to copy the retq for trampolines. */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -ENDPROC(ftrace_caller) +SYM_FUNC_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq + UNWIND_HINT_SAVE + /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ -GLOBAL(ftrace_regs_caller_op_ptr) +SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -209,7 +211,7 @@ GLOBAL(ftrace_regs_caller_op_ptr) /* regs go into 4th parameter */ leaq (%rsp), %rcx -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub /* Copy flags back to SS, to restore them */ @@ -228,7 +230,33 @@ GLOBAL(ftrace_regs_call) movq R10(%rsp), %r10 movq RBX(%rsp), %rbx - restore_mcount_regs + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* If ORIG_RAX is anything but zero, make this a call to that */ + movq ORIG_RAX(%rsp), %rax + cmpq $0, %rax + je 1f + + /* Swap the flags with orig_rax */ + movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + + jmp 2f + +1: restore_mcount_regs + + +2: + /* + * The stack layout is nondetermistic here, depending on which path was + * taken. This confuses objtool and ORC, rightfully so. For now, + * pretend the stack always looks like the non-direct case. + */ + UNWIND_HINT_RESTORE /* Restore flags */ popfq @@ -239,16 +267,16 @@ GLOBAL(ftrace_regs_call) * The trampoline will add the code to jump * to the return. */ -GLOBAL(ftrace_regs_caller_end) +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue -ENDPROC(ftrace_regs_caller) +SYM_FUNC_END(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -261,7 +289,7 @@ fgraph_trace: jnz ftrace_graph_caller #endif -GLOBAL(ftrace_stub) +SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) retq trace: @@ -279,11 +307,12 @@ trace: restore_mcount_regs jmp fgraph_trace -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_FUNC_START(ftrace_graph_caller) /* Saves rbp into %rdx and fills first parameter */ save_mcount_regs @@ -294,9 +323,9 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) -ENTRY(return_to_handler) +SYM_CODE_START(return_to_handler) UNWIND_HINT_EMPTY subq $24, %rsp @@ -312,5 +341,5 @@ ENTRY(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 29ffa495bd1c..206a4b6144c2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr, * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. */ pmd = fixup_pointer(level2_kernel_pgt, physaddr); - for (i = 0; i < PTRS_PER_PMD; i++) { + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; - } + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; /* * Fixup phys_base - remove the memory encryption mask to obtain diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30f9cb2c0b55..3923ab4630d7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -64,7 +64,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * can. */ __HEAD -ENTRY(startup_32) +SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -156,7 +156,7 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(xen_entry) +SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK) /* Unknown implementation; there's really nothing we can do at this point. */ ud2a @@ -172,6 +172,7 @@ num_subarch_entries = (. - subarch_entries) / 4 #else jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ +SYM_CODE_END(startup_32) #ifdef CONFIG_HOTPLUG_CPU /* @@ -179,12 +180,12 @@ num_subarch_entries = (. - subarch_entries) / 4 * up already except stack. We just set up stack here. Then call * start_secondary(). */ -ENTRY(start_cpu0) +SYM_FUNC_START(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp call *(initial_code) 1: jmp 1b -ENDPROC(start_cpu0) +SYM_FUNC_END(start_cpu0) #endif /* @@ -195,7 +196,7 @@ ENDPROC(start_cpu0) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ -ENTRY(startup_32_smp) +SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax movl %eax,%ds @@ -362,7 +363,7 @@ ENTRY(startup_32_smp) call *(initial_code) 1: jmp 1b -ENDPROC(startup_32_smp) +SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" @@ -392,7 +393,7 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handler_array) +SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip @@ -407,9 +408,9 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +SYM_FUNC_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -460,10 +461,10 @@ early_idt_handler_common: decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret -ENDPROC(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ -ENTRY(early_ignore_irq) +SYM_FUNC_START(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -498,19 +499,16 @@ ENTRY(early_ignore_irq) hlt_loop: hlt jmp hlt_loop -ENDPROC(early_ignore_irq) +SYM_FUNC_END(early_ignore_irq) __INITDATA .align 4 -GLOBAL(early_recursion_flag) - .long 0 +SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 -ENTRY(initial_code) - .long i386_start_kernel -ENTRY(setup_once_ref) - .long setup_once +SYM_DATA(initial_code, .long i386_start_kernel) +SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) @@ -553,7 +551,7 @@ EXPORT_SYMBOL(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PGD_ALIGN -ENTRY(initial_page_table) +SYM_DATA_START(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 @@ -571,17 +569,28 @@ ENTRY(initial_page_table) # error "Kernel PMDs should be 1, 2 or 3" # endif .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + +SYM_DATA_END(initial_page_table) #endif .data .balign 4 -ENTRY(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ - TOP_OF_KERNEL_STACK_PADDING; +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, + .long init_thread_union + THREAD_SIZE - + SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) __INITRODATA int_msg: @@ -597,27 +606,28 @@ int_msg: */ .data -.globl boot_gdt_descr - ALIGN # early boot GDT descriptor (must use 1:1 address mapping) .word 0 # 32 bit align gdt_desc.address -boot_gdt_descr: +SYM_DATA_START_LOCAL(boot_gdt_descr) .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET +SYM_DATA_END(boot_gdt_descr) # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(early_gdt_descr) +SYM_DATA_START(early_gdt_descr) .word GDT_ENTRIES*8-1 .long gdt_page /* Overwritten for secondary CPUs */ +SYM_DATA_END(early_gdt_descr) /* * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt) +SYM_DATA_START(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +SYM_DATA_END(boot_gdt) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index f3d3e9646a99..4bbc770af632 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,8 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .text __HEAD .code64 - .globl startup_64 -startup_64: +SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -90,7 +89,9 @@ startup_64: /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f -ENTRY(secondary_startup_64) +SYM_CODE_END(startup_64) + +SYM_CODE_START(secondary_startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -240,7 +241,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -END(secondary_startup_64) +SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -250,30 +251,28 @@ END(secondary_startup_64) * up already except stack. We just set up stack here. Then call * start_secondary() via .Ljump_to_C_code. */ -ENTRY(start_cpu0) +SYM_CODE_START(start_cpu0) UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code -END(start_cpu0) +SYM_CODE_END(start_cpu0) #endif /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 - GLOBAL(initial_code) - .quad x86_64_start_kernel - GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(fixed_percpu_data) - GLOBAL(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS +SYM_DATA(initial_code, .quad x86_64_start_kernel) +SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) + +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) __FINITDATA __INIT -ENTRY(early_idt_handler_array) +SYM_CODE_START(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 @@ -289,9 +288,9 @@ ENTRY(early_idt_handler_array) .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr UNWIND_HINT_IRET_REGS offset=16 -END(early_idt_handler_array) +SYM_CODE_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -333,17 +332,11 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel -END(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) - __INITDATA - .balign 4 -GLOBAL(early_recursion_flag) - .long 0 - -#define NEXT_PAGE(name) \ - .balign PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* @@ -358,11 +351,11 @@ GLOBAL(name) */ #define PTI_USER_PGD_FILL 512 /* This ensures they are 8k-aligned: */ -#define NEXT_PGD_PAGE(name) \ - .balign 2 * PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE) #else -#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_DATA_START_PAGE_ALIGNED(name) #define PTI_USER_PGD_FILL 0 #endif @@ -375,17 +368,23 @@ GLOBAL(name) .endr __INITDATA -NEXT_PGD_PAGE(early_top_pgt) + .balign 4 + +SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(early_top_pgt) -NEXT_PAGE(early_dynamic_pgts) +SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +SYM_DATA_END(early_dynamic_pgts) + +SYM_DATA(early_recursion_flag, .long 0) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH) -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -393,11 +392,13 @@ NEXT_PGD_PAGE(init_top_pgt) /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) -NEXT_PAGE(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 -NEXT_PAGE(level2_ident_pgt) +SYM_DATA_END(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt) /* * Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. @@ -407,25 +408,29 @@ NEXT_PAGE(level2_ident_pgt) * the CPU should ignore the bit. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +SYM_DATA_END(level2_ident_pgt) #else -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) #endif #ifdef CONFIG_X86_5LEVEL -NEXT_PAGE(level4_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level4_kernel_pgt) #endif -NEXT_PAGE(level3_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level3_kernel_pgt) -NEXT_PAGE(level2_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable * anyway. @@ -442,8 +447,9 @@ NEXT_PAGE(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) +SYM_DATA_END(level2_kernel_pgt) -NEXT_PAGE(level2_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 pgtno = 0 .rept (FIXMAP_PMD_NUM) @@ -453,31 +459,32 @@ NEXT_PAGE(level2_fixmap_pgt) .endr /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 +SYM_DATA_END(level2_fixmap_pgt) -NEXT_PAGE(level1_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) .fill 512,8,0 .endr +SYM_DATA_END(level1_fixmap_pgt) #undef PMDS .data .align 16 - .globl early_gdt_descr -early_gdt_descr: - .word GDT_ENTRIES*8-1 -early_gdt_descr_base: - .quad INIT_PER_CPU_VAR(gdt_page) - -ENTRY(phys_base) - /* This must match the first entry in level2_kernel_pgt */ - .quad 0x0000000000000000 + +SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) +SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) + + .align 16 +/* This must match the first entry in level2_kernel_pgt */ +SYM_DATA(phys_base, .quad 0x0) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS -NEXT_PAGE(empty_zero_page) +SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) .skip PAGE_SIZE +SYM_DATA_END(empty_zero_page) EXPORT_SYMBOL(empty_zero_page) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6f791bc481e..7a50f0b62a70 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -84,7 +84,7 @@ static inline void hpet_writel(unsigned int d, unsigned int a) static inline void hpet_set_mapping(void) { - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE); } static inline void hpet_clear_mapping(void) diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 4c407833faca..4d4f5d9faac3 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -74,9 +74,9 @@ bool arch_ima_get_secureboot(void) /* secureboot arch rules */ static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) +#if !IS_ENABLED(CONFIG_KEXEC_SIG) "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ "measure func=KEXEC_KERNEL_CHECK", #if !IS_ENABLED(CONFIG_MODULE_SIG) "appraise func=MODULE_CHECK appraise_type=imasig", diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 0fe1c8782208..8abeee0dd7bf 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -3,35 +3,74 @@ * This contains the io-permission bitmap code - written by obz, with changes * by Linus. 32/64 bits code unification by Miguel BotĂłn. */ - -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/kernel.h> #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> +#include <linux/security.h> #include <linux/syscalls.h> #include <linux/bitmap.h> -#include <asm/syscalls.h> +#include <linux/ioport.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/io_bitmap.h> #include <asm/desc.h> +#ifdef CONFIG_X86_IOPL_IOPERM + +static atomic64_t io_bitmap_sequence; + +void io_bitmap_share(struct task_struct *tsk) +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); +} + +static void task_update_io_bitmap(void) +{ + struct thread_struct *t = ¤t->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_thread_flag(TIF_IO_BITMAP); + } else { + clear_thread_flag(TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + +void io_bitmap_exit(void) +{ + struct io_bitmap *iobm = current->thread.io_bitmap; + + current->thread.io_bitmap = NULL; + task_update_io_bitmap(); + if (iobm && refcount_dec_and_test(&iobm->refcnt)) + kfree(iobm); +} + /* - * this changes the io permissions bitmap in the current task. + * This changes the io permissions bitmap in the current task. */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) { struct thread_struct *t = ¤t->thread; - struct tss_struct *tss; - unsigned int i, max_long, bytes, bytes_updated; + unsigned int i, max_long; + struct io_bitmap *iobm; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) return -EPERM; /* @@ -39,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ - if (!t->io_bitmap_ptr) { - unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - - if (!bitmap) + iobm = t->io_bitmap; + if (!iobm) { + /* No point to allocate a bitmap just to clear permissions */ + if (!turn_on) + return 0; + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) return -ENOMEM; - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); + refcount_set(&iobm->refcnt, 1); + } - /* - * Now that we have an IO bitmap, we need our TSS limit to be - * correct. It's fine if we are preempted after doing this: - * with TIF_IO_BITMAP set, context switches will keep our TSS - * limit correct. - */ - preempt_disable(); - refresh_tss_limit(); - preempt_enable(); + /* + * If the bitmap is not shared, then nothing can take a refcount as + * current can obviously not fork at the same time. If it's shared + * duplicate it and drop the refcount on the original one. + */ + if (refcount_read(&iobm->refcnt) > 1) { + iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + refcount_set(&iobm->refcnt, 1); + io_bitmap_exit(); } /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: + * Store the bitmap pointer (might be the same if the task already + * head one). Must be done here so freeing the bitmap when all + * permissions are dropped has the pointer set up. */ - tss = &per_cpu(cpu_tss_rw, get_cpu()); + t->io_bitmap = iobm; + /* Mark it active for context switching and exit to user mode */ + set_thread_flag(TIF_IO_BITMAP); + /* + * Update the tasks bitmap. The update of the TSS bitmap happens on + * exit to user mode. So this needs no protection. + */ if (turn_on) - bitmap_clear(t->io_bitmap_ptr, from, num); + bitmap_clear(iobm->bitmap, from, num); else - bitmap_set(t->io_bitmap_ptr, from, num); + bitmap_set(iobm->bitmap, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, * to keep it obviously correct: */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) + max_long = UINT_MAX; + for (i = 0; i < IO_BITMAP_LONGS; i++) { + if (iobm->bitmap[i] != ~0UL) max_long = i; + } + /* All permissions dropped? */ + if (max_long == UINT_MAX) { + io_bitmap_exit(); + return 0; + } - bytes = (max_long + 1) * sizeof(unsigned long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + iobm->max = (max_long + 1) * sizeof(unsigned long); - put_cpu(); + /* + * Update the sequence number to force a TSS update on return to + * user mode. + */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); return 0; } @@ -102,37 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) } /* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. + * The sys_iopl functionality depends on the level argument, which if + * granted for the task is used to enable access to all 65536 I/O ports. + * + * This does not use the IOPL mechanism provided by the CPU as that would + * also allow the user space task to use the CLI/STI instructions. * - * Here we just change the flags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. + * Disabling interrupts in a user space task is dangerous as it might lock + * up the machine and the semantics vs. syscalls and exceptions is + * undefined. + * + * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3 + * 3 enables them. + * + * IOPL is strictly per thread and inherited on fork. */ SYSCALL_DEFINE1(iopl, unsigned int, level) { - struct pt_regs *regs = current_pt_regs(); struct thread_struct *t = ¤t->thread; - - /* - * Careful: the IOPL bits in regs->flags are undefined under Xen PV - * and changing them has no effect. - */ - unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + unsigned int old; if (level > 3) return -EINVAL; + + old = t->iopl_emul; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | - (level << X86_EFLAGS_IOPL_BIT); - t->iopl = level << X86_EFLAGS_IOPL_BIT; - set_iopl_mask(t->iopl); + + t->iopl_emul = level; + task_update_io_bitmap(); return 0; } + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4215653f8a8e..21efee32e2b1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -243,11 +243,15 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); - - if (!handle_irq(desc, regs)) { + if (likely(!IS_ERR_OR_NULL(desc))) { + if (IS_ENABLED(CONFIG_X86_32)) + handle_irq(desc, regs); + else + generic_handle_irq_desc(desc); + } else { ack_APIC_irq(); - if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) { + if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), vector); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index fc34816c6f04..a759ca97cd01 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,18 +148,13 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); - if (IS_ERR_OR_NULL(desc)) - return false; - if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) print_stack_overflow(); generic_handle_irq_desc(desc); } - - return true; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6bf6517a05bb..12df3a4abfdd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,15 +26,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; DECLARE_INIT_PER_CPU(irq_stack_backing_store); -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) -{ - if (IS_ERR_OR_NULL(desc)) - return false; - - generic_handle_irq_desc(desc); - return true; -} - #ifdef CONFIG_VMAP_STACK /* * VMAP the backing store with guard pages diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index ddeeaac8adda..0db0375235b4 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,20 +7,20 @@ /* * unsigned long native_save_fl(void) */ -ENTRY(native_save_fl) +SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret -ENDPROC(native_save_fl) +SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) /* * void native_restore_fl(unsigned long flags) * %eax/%rdi: flags */ -ENTRY(native_restore_fl) +SYM_FUNC_START(native_restore_fl) push %_ASM_ARG1 popf ret -ENDPROC(native_restore_fl) +SYM_FUNC_END(native_restore_fl) EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a..6eb8b50ea07e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -11,6 +11,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/kernel.h> #include <linux/reboot.h> +#include <linux/serial_8250.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/hypervisor.h> @@ -21,9 +22,24 @@ #include <asm/setup.h> #include <asm/jailhouse_para.h> -static __initdata struct jailhouse_setup_data setup_data; +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + static unsigned int precalibrated_tsc_khz; +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + static uint32_t jailhouse_cpuid_base(void) { if (boot_cpu_data.cpuid_level < 0 || @@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_period = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early) .type = IOAPIC_DOMAIN_STRICT, .ops = &mp_ioapic_irqdomain_ops, }; - struct mpc_intsrc mp_irq = { - .type = MP_INTSRC, - .irqtype = mp_INT, - .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, - }; unsigned int cpu; jailhouse_x2apic_init(); register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { - generic_processor_info(setup_data.cpu_ids[cpu], + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { + generic_processor_info(setup_data.v1.cpu_ids[cpu], boot_cpu_apic_version); } smp_found_config = 1; - if (setup_data.standard_ioapic) { + if (setup_data.v1.standard_ioapic) { mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); - /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ - mp_irq.srcbusirq = mp_irq.dstirq = 3; - mp_save_irq(&mp_irq); - - mp_irq.srcbusirq = mp_irq.dstirq = 4; - mp_save_irq(&mp_irq); + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } } } @@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void) pcibios_last_bus = 0xff; #ifdef CONFIG_PCI_MMCONFIG - if (setup_data.pci_mmconfig_base) { + if (setup_data.v1.pci_mmconfig_base) { pci_mmconfig_add(0, 0, pcibios_last_bus, - setup_data.pci_mmconfig_base); + setup_data.v1.pci_mmconfig_base); pci_mmcfg_arch_init(); } #endif @@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void) return 0; } +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; struct setup_data header; void *mapping; @@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void) memcpy(&header, mapping, sizeof(header)); early_memunmap(mapping, sizeof(header)); - if (header.type == SETUP_JAILHOUSE && - header.len >= sizeof(setup_data)) { - pa_data += offsetof(struct setup_data, data); - - mapping = early_memremap(pa_data, sizeof(setup_data)); - memcpy(&setup_data, mapping, sizeof(setup_data)); - early_memunmap(mapping, sizeof(setup_data)); - + if (header.type == SETUP_JAILHOUSE) break; - } pa_data = header.next; } @@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void) if (!pa_data) panic("Jailhouse: No valid setup data found"); - if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) - panic("Jailhouse: Unsupported setup data structure"); - - pmtmr_ioport = setup_data.pm_timer_address; + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); - precalibrated_tsc_khz = setup_data.tsc_khz; + precalibrated_tsc_khz = setup_data.v1.tsc_khz; setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); pci_probe = 0; @@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void) * are none in a non-root cell. */ disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); } bool jailhouse_paravirt(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 044053235302..9c4498ea0b3c 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,15 +16,7 @@ #include <asm/alternative.h> #include <asm/text-patching.h> -union jump_code_union { - char code[JUMP_LABEL_NOP_SIZE]; - struct { - char jump; - int offset; - } __attribute__((packed)); -}; - -static void bug_at(unsigned char *ip, int line) +static void bug_at(const void *ip, int line) { /* * The location is not an op that we were expecting. @@ -35,42 +27,42 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_set_jump_code(struct jump_entry *entry, - enum jump_label_type type, - union jump_code_union *code, - int init) +static const void * +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) { const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect; + const void *expect, *code; + const void *addr, *dest; int line; - code->jump = 0xe9; - code->offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + addr = (void *)jump_entry_code(entry); + dest = (void *)jump_entry_target(entry); + + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); if (init) { expect = default_nop; line = __LINE__; } else if (type == JUMP_LABEL_JMP) { expect = ideal_nop; line = __LINE__; } else { - expect = code->code; line = __LINE__; + expect = code; line = __LINE__; } - if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) - bug_at((void *)jump_entry_code(entry), line); + if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) + bug_at(addr, line); if (type == JUMP_LABEL_NOP) - memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); + code = ideal_nop; + + return code; } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void inline __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { - union jump_code_union code; - - __jump_label_set_jump_code(entry, type, &code, init); + const void *opcode = __jump_label_set_jump_code(entry, type, init); /* * As long as only a single processor is running and the code is still @@ -84,32 +76,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), &code, + text_poke_early((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, - (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); } -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +static void __ref jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { mutex_lock(&text_mutex); - __jump_label_transform(entry, type, 0); + __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type, 0); +} bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - struct text_poke_loc *tp; - void *entry_code; + const void *opcode; if (system_state == SYSTEM_BOOTING) { /* @@ -119,55 +112,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, return true; } - /* - * No more space in the vector, tell upper layer to apply - * the queue before continuing. - */ - if (tp_vec_nr == TP_VEC_MAX) - return false; - - tp = &tp_vec[tp_vec_nr]; - - entry_code = (void *)jump_entry_code(entry); - - /* - * The INT3 handler will do a bsearch in the queue, so we need entries - * to be sorted. We can survive an unsorted list by rejecting the entry, - * forcing the generic jump_label code to apply the queue. Warning once, - * to raise the attention to the case of an unsorted entry that is - * better not happen, because, in the worst case we will perform in the - * same way as we do without batching - with some more overhead. - */ - if (tp_vec_nr > 0) { - int prev = tp_vec_nr - 1; - struct text_poke_loc *prev_tp = &tp_vec[prev]; - - if (WARN_ON_ONCE(prev_tp->addr > entry_code)) - return false; - } - - __jump_label_set_jump_code(entry, type, - (union jump_code_union *) &tp->opcode, 0); - - tp->addr = entry_code; - tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; - tp->len = JUMP_LABEL_NOP_SIZE; - - tp_vec_nr++; - + mutex_lock(&text_mutex); + opcode = __jump_label_set_jump_code(entry, type, 0); + text_poke_queue((void *)jump_entry_code(entry), + opcode, JUMP_LABEL_NOP_SIZE, NULL); + mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { - if (!tp_vec_nr) - return; - mutex_lock(&text_mutex); - text_poke_bp_batch(tp_vec, tp_vec_nr); + text_poke_finish(); mutex_unlock(&text_mutex); - - tp_vec_nr = 0; } static enum { @@ -196,5 +153,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, 1); + jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index edaa30b20841..64b6da95af98 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, if (count > node->len - pos) count = node->len - pos; - pa = node->paddr + sizeof(struct setup_data) + pos; + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + p = memremap(pa, count, MEMREMAP_WB); if (!p) return -ENOMEM; @@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - node->paddr = pa_data; - node->type = data->type; - node->len = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + node->paddr = ((struct setup_indirect *)data->data)->addr; + node->type = ((struct setup_indirect *)data->data)->type; + node->len = ((struct setup_indirect *)data->data)->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + create_setup_data_node(d, no, node); pa_data = data->next; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 5ebcd02cbca7..f293d872602a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -177,9 +177,10 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, * acpi_rsdp=<addr> on kernel command line to make second kernel boot * without efi. */ - if (efi_enabled(EFI_OLD_MEMMAP)) + if (efi_have_uv1_memmap()) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 23297ea64f5f..c44fe7d8d9a4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -416,7 +416,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) */ void kgdb_roundup_cpus(void) { - apic->send_IPI_allbutself(APIC_DM_NMI); + apic_send_IPI_allbutself(NMI_VECTOR); } #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0e0b08008b5a..4d7022a740ab 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -40,6 +40,7 @@ #include <linux/frame.h> #include <linux/kasan.h> #include <linux/moduleloader.h> +#include <linux/vmalloc.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -119,14 +120,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op) /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -301,7 +302,7 @@ static int can_probe(unsigned long paddr) * Another debugging subsystem might insert this breakpoint. * In that case, we can't recover it. */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; addr += insn.length; } @@ -351,8 +352,12 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) kernel_insn_init(insn, dest, MAX_INSN_SIZE); insn_get_length(insn); + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + /* Another subsystem puts a breakpoint, failed to recover */ - if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* We should not singlestep on the exception masking instructions */ @@ -396,14 +401,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, int len = insn->length; if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; @@ -497,12 +502,14 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); + text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + text_poke_sync(); } void arch_disarm_kprobe(struct kprobe *p) { text_poke(p->addr, &p->opcode, 1); + text_poke_sync(); } void arch_remove_kprobe(struct kprobe *p) @@ -580,7 +587,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, if (setup_detour_execution(p, regs, reenter)) return; -#if !defined(CONFIG_PREEMPT) +#if !defined(CONFIG_PREEMPTION) if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) @@ -605,7 +612,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, regs->flags |= X86_EFLAGS_TF; regs->flags &= ~X86_EFLAGS_IF; /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) + if (p->opcode == INT3_INSN_OPCODE) regs->ip = (unsigned long)p->addr; else regs->ip = (unsigned long)p->ainsn.insn; @@ -691,7 +698,7 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != BREAKPOINT_INSTRUCTION) { + } else if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9d4aedece363..3f45b5c43a71 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) long offs; int i; - for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + for (i = 0; i < JMP32_INSN_SIZE; i++) { kp = get_kprobe((void *)addr - i); /* This function only handles jump-optimized kprobe */ if (kp && kprobe_optimized(kp)) { @@ -62,10 +62,10 @@ found: if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); } else { offs = addr - (unsigned long)kp->addr - 1; - memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); } return (unsigned long)buf; @@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_END_IDX \ ((long)optprobe_template_end - (long)optprobe_template_entry) -#define INT3_SIZE sizeof(kprobe_opcode_t) - /* Optimized kprobe call back function: called from optinsn */ static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) @@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif - regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; regs->orig_ax = ~0UL; __this_cpu_write(current_kprobe, &op->kp); @@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) struct insn insn; int len = 0, ret; - while (len < RELATIVEJUMP_SIZE) { + while (len < JMP32_INSN_SIZE) { ret = __copy_instruction(dest + len, src + len, real + len, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; @@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr) return 0; /* Check there is enough space for a relative jump. */ - if (size - offset < RELATIVEJUMP_SIZE) + if (size - offset < JMP32_INSN_SIZE) return 0; /* Decode instructions */ @@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr) kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); /* Check any instructions don't jump into target */ if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_SIZE, - RELATIVE_ADDR_SIZE)) + insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + DISP32_SIZE)) return 0; addr += insn.length; } @@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, * Verify if the address gap is in 2GB range, because this uses * a relative jump. */ - rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; if (abs(rel) > 0x7fffffff) { ret = -ERANGE; goto err; @@ -401,9 +399,9 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, /* Set returning jmp instruction at the tail of out-of-line buffer */ synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; - /* We have to use text_poke for instuction buffer because it is RO */ + /* We have to use text_poke() for instruction buffer because it is RO */ text_poke(slot, buf, len); ret = 0; out: @@ -416,44 +414,50 @@ err: } /* - * Replace breakpoints (int3) with relative jumps. + * Replace breakpoints (INT3) with relative jumps (JMP.d32). * Caller must call with locking kprobe_mutex and text_mutex. + * + * The caller will have installed a regular kprobe and after that issued + * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in + * the 4 bytes after the INT3 are unused and can now be overwritten. */ void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 insn_buff[JMP32_INSN_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + ((long)op->kp.addr + JMP32_INSN_SIZE)); WARN_ON(kprobe_disabled(&op->kp)); /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, + DISP32_SIZE); - insn_buff[0] = RELATIVEJUMP_OPCODE; + insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } } -/* Replace a relative jump with a breakpoint (int3). */ +/* + * Replace a relative jump (JMP.d32) with a breakpoint (INT3). + * + * After that, we can restore the 4 bytes after the INT3 to undo what + * arch_optimize_kprobes() scribbled. This is safe since those bytes will be + * unused once the INT3 lands. + */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buff[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - insn_buff[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + arch_arm_kprobe(&op->kp); + text_poke(op->kp.addr + INT3_INSN_SIZE, + op->optinsn.copied_insn, DISP32_SIZE); + text_poke_sync(); } /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 7969da939213..d0a19121c6a4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size) if (!data) return -ENOMEM; if (nr == i) { - *size = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + *size = ((struct setup_indirect *)data->data)->len; + else + *size = data->len; + memunmap(data); return 0; } @@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - ret = sprintf(buf, "0x%x\n", data->type); + if (data->type == SETUP_INDIRECT) + ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); + else + ret = sprintf(buf, "0x%x\n", data->type); memunmap(data); return ret; } @@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp, loff_t off, size_t count) { int nr, ret = 0; - u64 paddr; + u64 paddr, len; struct setup_data *data; void *p; @@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (off > data->len) { + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { ret = -EINVAL; goto out; } - if (count > data->len - off) - count = data->len - off; + if (count > len - off) + count = len - off; if (!count) goto out; ret = count; - p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); + p = memremap(paddr, len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4ab377c9fffe..d817f255aed8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -33,6 +33,7 @@ #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/tlb.h> +#include <asm/cpuidle_haltpoll.h> static int kvmapf = 1; @@ -244,17 +245,13 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); dotraplinkage void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - enum ctx_state prev_state; - switch (kvm_read_and_reset_pf_reason()) { default: do_page_fault(regs, error_code, address); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ - prev_state = exception_enter(); kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); @@ -311,7 +308,7 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif pa |= KVM_ASYNC_PF_ENABLED; @@ -502,16 +499,6 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } -static void kvm_send_ipi_allbutself(int vector) -{ - kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); -} - -static void kvm_send_ipi_all(int vector) -{ - __send_ipi_mask(cpu_online_mask, vector); -} - /* * Set the IPI entry points */ @@ -519,8 +506,6 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = kvm_send_ipi_allbutself; - apic->send_IPI_all = kvm_send_ipi_all; pr_info("KVM setup pv IPIs\n"); } @@ -705,6 +690,7 @@ unsigned int kvm_arch_para_hints(void) { return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); } +EXPORT_SYMBOL_GPL(kvm_arch_para_hints); static uint32_t __init kvm_detect(void) { @@ -750,6 +736,9 @@ static __init int kvm_setup_pv_tlb_flush(void) { int cpu; + if (!kvm_para_available() || nopv) + return 0; + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { @@ -867,3 +856,39 @@ void __init kvm_spinlock_init(void) } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + pr_err_once("kvm: host does not support poll control\n"); + pr_err_once("kvm: host upgrade recommended\n"); + return; + } + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b2463fcb20a8..c57e1ca70fd1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -28,6 +28,89 @@ #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/syscalls.h> +#include <asm/pgtable_areas.h> + +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +} + +void load_mm_ldt(struct mm_struct *mm) +{ + struct ldt_struct *ldt; + + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all + * CPUs with the mm active. The LDT will not be freed until + * after the IPI is handled by all such CPUs. This means that, + * if the ldt_struct changes before we return, the values we see + * will be safe, and the new values will be loaded before we run + * any user code. + * + * NB: don't try to convert this to use RCU without extreme care. + * We would still need IRQs off, because we don't want to change + * the local LDT after an IPI loaded a newer value than the one + * that we can see. + */ + + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { + clear_LDT(); + } +} + +void switch_ldt(struct mm_struct *prev, struct mm_struct *next) +{ + /* + * Load the LDT if either the old or new mm had an LDT. + * + * An mm will never go from having an LDT to not having an LDT. Two + * mms never share an LDT, so we don't gain anything by checking to + * see whether the LDT changed. There's also no guarantee that + * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, + * then prev->context.ldt will also be non-NULL. + * + * If we really cared, we could optimize the case where prev == next + * and we're exiting lazy mode. Most of the time, if this happens, + * we don't actually need to reload LDTR, but modify_ldt() is mostly + * used by legacy code and emulators where we don't need this level of + * performance. + * + * This uses | instead of || because it generates better code. + */ + if (unlikely((unsigned long)prev->context.ldt | + (unsigned long)next->context.ldt)) + load_mm_ldt(next); + + DEBUG_LOCKS_WARN_ON(preemptible()); +} static void refresh_ldt_segments(void) { diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 77854b192fef..02bddfc122a4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -148,7 +148,7 @@ int machine_kexec_prepare(struct kimage *image) { int error; - set_pages_x(image->control_code_page, 1); + set_memory_x((unsigned long)page_address(image->control_code_page), 1); error = machine_kexec_alloc_page_tables(image); if (error) return error; @@ -162,7 +162,7 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { - set_pages_nx(image->control_code_page, 1); + set_memory_nx((unsigned long)page_address(image->control_code_page), 1); machine_kexec_free_page_tables(image); } @@ -250,15 +250,3 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } - -void arch_crash_save_vmcoreinfo(void) -{ -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifdef CONFIG_X86_PAE - VMCOREINFO_CONFIG(X86_PAE); -#endif -} - diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5dcd438ad8f2..ad5cdd6a5f23 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -298,48 +298,6 @@ static void load_segments(void) ); } -#ifdef CONFIG_KEXEC_FILE -/* Update purgatory as needed after various image segments have been prepared */ -static int arch_update_purgatory(struct kimage *image) -{ - int ret = 0; - - if (!image->file_mode) - return 0; - - /* Setup copying of backup region */ - if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_dest", - &image->arch.backup_load_addr, - sizeof(image->arch.backup_load_addr), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_src", - &image->arch.backup_src_start, - sizeof(image->arch.backup_src_start), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_sz", - &image->arch.backup_src_sz, - sizeof(image->arch.backup_src_sz), 0); - if (ret) - return ret; - } - - return ret; -} -#else /* !CONFIG_KEXEC_FILE */ -static inline int arch_update_purgatory(struct kimage *image) -{ - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; - /* update purgatory as needed */ - result = arch_update_purgatory(image); - if (result) - return result; - return 0; } @@ -445,25 +398,6 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } -void arch_crash_save_vmcoreinfo(void) -{ - u64 sme_mask = sme_me_mask; - - VMCOREINFO_NUMBER(phys_base); - VMCOREINFO_SYMBOL(init_top_pgt); - vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", - pgtable_l5_enabled()); - -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", - kaslr_offset()); - VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); - VMCOREINFO_NUMBER(sme_mask); -} - /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3db2252b958d..1547be359d7f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -34,6 +34,7 @@ #include <linux/notifier.h> #include <linux/uaccess.h> #include <linux/gfp.h> +#include <linux/security.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -79,6 +80,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, int err = 0; ssize_t bytes = 0; + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -130,6 +135,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EFAULT; break; } + err = security_locked_down(LOCKDOWN_MSR); + if (err) + break; err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4df7705022b9..54c21d6abd5a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -104,18 +104,22 @@ static int __init nmi_warning_debugfs(void) } fs_initcall(nmi_warning_debugfs); -static void nmi_max_handler(struct irq_work *w) +static void nmi_check_duration(struct nmiaction *action, u64 duration) { - struct nmiaction *a = container_of(w, struct nmiaction, irq_work); + u64 whole_msecs = READ_ONCE(action->max_duration); int remainder_ns, decimal_msecs; - u64 whole_msecs = READ_ONCE(a->max_duration); + + if (duration < nmi_longest_ns || duration < action->max_duration) + return; + + action->max_duration = duration; remainder_ns = do_div(whole_msecs, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; printk_ratelimited(KERN_INFO "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - a->handler, whole_msecs, decimal_msecs); + action->handler, whole_msecs, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) @@ -142,11 +146,7 @@ static int nmi_handle(unsigned int type, struct pt_regs *regs) delta = sched_clock() - delta; trace_nmi_handler(a->handler, (int)delta, thishandled); - if (delta < nmi_longest_ns || delta < a->max_duration) - continue; - - a->max_duration = delta; - irq_work_queue(&a->irq_work); + nmi_check_duration(a, delta); } rcu_read_unlock(); @@ -164,8 +164,6 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) if (!action->handler) return -EINVAL; - init_irq_work(&action->irq_work, nmi_max_handler); - raw_spin_lock_irqsave(&desc->lock, flags); /* @@ -512,6 +510,9 @@ NOKPROBE_SYMBOL(is_debug_stack); dotraplinkage notrace void do_nmi(struct pt_regs *regs, long error_code) { + if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) + return; + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { this_cpu_write(nmi_state, NMI_LATCHED); return; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 0aa6256eedd8..789f5e4f89de 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -311,10 +311,6 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, -#ifdef CONFIG_X86_64 - .cpu.read_cr8 = native_read_cr8, - .cpu.write_cr8 = native_write_cr8, -#endif .cpu.wbinvd = native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, @@ -345,8 +341,6 @@ struct paravirt_patch_template pv_ops = { .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, - .cpu.set_iopl_mask = native_set_iopl_mask, - .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c deleted file mode 100644 index 9d4343aa481b..000000000000 --- a/arch/x86/kernel/pci-calgary_64.c +++ /dev/null @@ -1,1584 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Derived from arch/powerpc/kernel/iommu.c - * - * Copyright IBM Corporation, 2006-2007 - * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> - * - * Author: Jon Mason <jdmason@kudzu.us> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - - */ - -#define pr_fmt(fmt) "Calgary: " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/crash_dump.h> -#include <linux/dma-mapping.h> -#include <linux/dma-direct.h> -#include <linux/bitmap.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/scatterlist.h> -#include <linux/iommu-helper.h> - -#include <asm/iommu.h> -#include <asm/calgary.h> -#include <asm/tce.h> -#include <asm/pci-direct.h> -#include <asm/dma.h> -#include <asm/rio.h> -#include <asm/bios_ebda.h> -#include <asm/x86_init.h> -#include <asm/iommu_table.h> - -#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT -int use_calgary __read_mostly = 1; -#else -int use_calgary __read_mostly = 0; -#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ - -#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 - -/* register offsets inside the host bridge space */ -#define CALGARY_CONFIG_REG 0x0108 -#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ -#define PHB_PLSSR_OFFSET 0x0120 -#define PHB_CONFIG_RW_OFFSET 0x0160 -#define PHB_IOBASE_BAR_LOW 0x0170 -#define PHB_IOBASE_BAR_HIGH 0x0180 -#define PHB_MEM_1_LOW 0x0190 -#define PHB_MEM_1_HIGH 0x01A0 -#define PHB_IO_ADDR_SIZE 0x01B0 -#define PHB_MEM_1_SIZE 0x01C0 -#define PHB_MEM_ST_OFFSET 0x01D0 -#define PHB_AER_OFFSET 0x0200 -#define PHB_CONFIG_0_HIGH 0x0220 -#define PHB_CONFIG_0_LOW 0x0230 -#define PHB_CONFIG_0_END 0x0240 -#define PHB_MEM_2_LOW 0x02B0 -#define PHB_MEM_2_HIGH 0x02C0 -#define PHB_MEM_2_SIZE_HIGH 0x02D0 -#define PHB_MEM_2_SIZE_LOW 0x02E0 -#define PHB_DOSHOLE_OFFSET 0x08E0 - -/* CalIOC2 specific */ -#define PHB_SAVIOR_L2 0x0DB0 -#define PHB_PAGE_MIG_CTRL 0x0DA8 -#define PHB_PAGE_MIG_DEBUG 0x0DA0 -#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 - -/* PHB_CONFIG_RW */ -#define PHB_TCE_ENABLE 0x20000000 -#define PHB_SLOT_DISABLE 0x1C000000 -#define PHB_DAC_DISABLE 0x01000000 -#define PHB_MEM2_ENABLE 0x00400000 -#define PHB_MCSR_ENABLE 0x00100000 -/* TAR (Table Address Register) */ -#define TAR_SW_BITS 0x0000ffffffff800fUL -#define TAR_VALID 0x0000000000000008UL -/* CSR (Channel/DMA Status Register) */ -#define CSR_AGENT_MASK 0xffe0ffff -/* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL -/* PMCR/PMDR (Page Migration Control/Debug Registers */ -#define PMR_SOFTSTOP 0x80000000 -#define PMR_SOFTSTOPFAULT 0x40000000 -#define PMR_HARDSTOP 0x20000000 - -/* - * The maximum PHB bus number. - * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 - * x3950M2: 4 chassis, 48 PHBs per chassis = 192 - * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 - * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 - */ -#define MAX_PHB_BUS_NUM 256 - -#define PHBS_PER_CALGARY 4 - -/* register offsets in Calgary's internal register space */ -static const unsigned long tar_offsets[] = { - 0x0580 /* TAR0 */, - 0x0588 /* TAR1 */, - 0x0590 /* TAR2 */, - 0x0598 /* TAR3 */ -}; - -static const unsigned long split_queue_offsets[] = { - 0x4870 /* SPLIT QUEUE 0 */, - 0x5870 /* SPLIT QUEUE 1 */, - 0x6870 /* SPLIT QUEUE 2 */, - 0x7870 /* SPLIT QUEUE 3 */ -}; - -static const unsigned long phb_offsets[] = { - 0x8000 /* PHB0 */, - 0x9000 /* PHB1 */, - 0xA000 /* PHB2 */, - 0xB000 /* PHB3 */ -}; - -/* PHB debug registers */ - -static const unsigned long phb_debug_offsets[] = { - 0x4000 /* PHB 0 DEBUG */, - 0x5000 /* PHB 1 DEBUG */, - 0x6000 /* PHB 2 DEBUG */, - 0x7000 /* PHB 3 DEBUG */ -}; - -/* - * STUFF register for each debug PHB, - * byte 1 = start bus number, byte 2 = end bus number - */ - -#define PHB_DEBUG_STUFF_OFFSET 0x0020 - -unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; -static int translate_empty_slots __read_mostly = 0; -static int calgary_detected __read_mostly = 0; - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; - -struct calgary_bus_info { - void *tce_space; - unsigned char translation_disabled; - signed char phbid; - void __iomem *bbar; -}; - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calgary_tce_cache_blast(struct iommu_table *tbl); -static void calgary_dump_error_regs(struct iommu_table *tbl); -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calioc2_tce_cache_blast(struct iommu_table *tbl); -static void calioc2_dump_error_regs(struct iommu_table *tbl); -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); -static void get_tce_space_from_tar(void); - -static const struct cal_chipset_ops calgary_chip_ops = { - .handle_quirks = calgary_handle_quirks, - .tce_cache_blast = calgary_tce_cache_blast, - .dump_error_regs = calgary_dump_error_regs -}; - -static const struct cal_chipset_ops calioc2_chip_ops = { - .handle_quirks = calioc2_handle_quirks, - .tce_cache_blast = calioc2_tce_cache_blast, - .dump_error_regs = calioc2_dump_error_regs -}; - -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; - -static inline int translation_enabled(struct iommu_table *tbl) -{ - /* only PHBs with translation enabled have an IOMMU table */ - return (tbl != NULL); -} - -static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) -{ - unsigned long index; - unsigned long end; - unsigned long flags; - - index = start_addr >> PAGE_SHIFT; - - /* bail out if we're asked to reserve a region we don't cover */ - if (index >= tbl->it_size) - return; - - end = index + npages; - if (end > tbl->it_size) /* don't go off the table */ - end = tbl->it_size; - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_set(tbl->it_map, index, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static unsigned long iommu_range_alloc(struct device *dev, - struct iommu_table *tbl, - unsigned int npages) -{ - unsigned long flags; - unsigned long offset; - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - BUG_ON(npages == 0); - - spin_lock_irqsave(&tbl->it_lock, flags); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - tbl->chip_ops->tce_cache_blast(tbl); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - pr_warn("IOMMU full\n"); - spin_unlock_irqrestore(&tbl->it_lock, flags); - if (panic_on_overflow) - panic("Calgary: fix the allocator.\n"); - else - return DMA_MAPPING_ERROR; - } - } - - tbl->it_hint = offset + npages; - BUG_ON(tbl->it_hint > tbl->it_size); - - spin_unlock_irqrestore(&tbl->it_lock, flags); - - return offset; -} - -static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, - void *vaddr, unsigned int npages, int direction) -{ - unsigned long entry; - dma_addr_t ret; - - entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_MAPPING_ERROR)) { - pr_warn("failed to allocate %u pages in iommu %p\n", - npages, tbl); - return DMA_MAPPING_ERROR; - } - - /* set the return dma address */ - ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); - - /* put the TCEs in the HW table */ - tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, - direction); - return ret; -} - -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) -{ - unsigned long entry; - unsigned long flags; - - /* were we called with bad_dma_address? */ - if (unlikely(dma_addr == DMA_MAPPING_ERROR)) { - WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " - "address 0x%Lx\n", dma_addr); - return; - } - - entry = dma_addr >> PAGE_SHIFT; - - BUG_ON(entry + npages > tbl->it_size); - - tce_free(tbl, entry, npages); - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_clear(tbl->it_map, entry, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static inline struct iommu_table *find_iommu_table(struct device *dev) -{ - struct pci_dev *pdev; - struct pci_bus *pbus; - struct iommu_table *tbl; - - pdev = to_pci_dev(dev); - - /* search up the device tree for an iommu */ - pbus = pdev->bus; - do { - tbl = pci_iommu(pbus); - if (tbl && tbl->it_busno == pbus->number) - break; - tbl = NULL; - pbus = pbus->parent; - } while (pbus); - - BUG_ON(tbl && (tbl->it_busno != pbus->number)); - - return tbl; -} - -static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems,enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - int i; - - if (!translation_enabled(tbl)) - return; - - for_each_sg(sglist, s, nelems, i) { - unsigned int npages; - dma_addr_t dma = s->dma_address; - unsigned int dmalen = s->dma_length; - - if (dmalen == 0) - break; - - npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); - iommu_free(tbl, dma, npages); - } -} - -static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - unsigned long vaddr; - unsigned int npages; - unsigned long entry; - int i; - - for_each_sg(sg, s, nelems, i) { - BUG_ON(!sg_page(s)); - - vaddr = (unsigned long) sg_virt(s); - npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); - - entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_MAPPING_ERROR) { - /* makes sure unmap knows to stop */ - s->dma_length = 0; - goto error; - } - - s->dma_address = (entry << PAGE_SHIFT) | s->offset; - - /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); - - s->dma_length = s->length; - } - - return nelems; -error: - calgary_unmap_sg(dev, sg, nelems, dir, 0); - for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_MAPPING_ERROR; - sg->dma_length = 0; - } - return 0; -} - -static dma_addr_t calgary_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - void *vaddr = page_address(page) + offset; - unsigned long uaddr; - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, PAGE_SIZE); - - return iommu_alloc(dev, tbl, vaddr, npages, dir); -} - -static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned int npages; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - iommu_free(tbl, dma_addr, npages); -} - -static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) -{ - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); /* size rounded up to full pages */ - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(flag, order); - if (!ret) - goto error; - memset(ret, 0, size); - - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_MAPPING_ERROR) - goto free; - *dma_handle = mapping; - return ret; -free: - free_pages((unsigned long)ret, get_order(size)); - ret = NULL; -error: - return ret; -} - -static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static const struct dma_map_ops calgary_dma_ops = { - .alloc = calgary_alloc_coherent, - .free = calgary_free_coherent, - .map_sg = calgary_map_sg, - .unmap_sg = calgary_unmap_sg, - .map_page = calgary_map_page, - .unmap_page = calgary_unmap_page, - .dma_supported = dma_direct_supported, -}; - -static inline void __iomem * busno_to_bbar(unsigned char num) -{ - return bus_info[num].bbar; -} - -static inline int busno_to_phbid(unsigned char num) -{ - return bus_info[num].phbid; -} - -static inline unsigned long split_queue_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return split_queue_offsets[idx]; -} - -static inline unsigned long tar_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return tar_offsets[idx]; -} - -static inline unsigned long phb_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return phb_offsets[idx]; -} - -static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) -{ - unsigned long target = ((unsigned long)bar) | offset; - return (void __iomem*)target; -} - -static inline int is_calioc2(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALIOC2); -} - -static inline int is_calgary(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALGARY); -} - -static inline int is_cal_pci_dev(unsigned short device) -{ - return (is_calgary(device) || is_calioc2(device)); -} - -static void calgary_tce_cache_blast(struct iommu_table *tbl) -{ - u64 val; - u32 aer; - int i = 0; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - - /* disable arbitration on the bus */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - aer = readl(target); - writel(0, target); - - /* read plssr to ensure it got there */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - val = readl(target); - - /* poll split queues until all DMA activity is done */ - target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); - do { - val = readq(target); - i++; - } while ((val & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("PCI bus not quiesced, continuing anyway\n"); - - /* invalidate TCE cache */ - target = calgary_reg(bbar, tar_offset(tbl->it_busno)); - writeq(tbl->tar_val, target); - - /* enable arbitration */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - writel(aer, target); - (void)readl(target); /* flush */ -} - -static void calioc2_tce_cache_blast(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u64 val64; - u32 val; - int i = 0; - int count = 1; - unsigned char bus = tbl->it_busno; - -begin: - printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " - "sequence - count %d\n", bus, count); - - /* 1. using the Page Migration Control reg set SoftStop */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); - val |= PMR_SOFTSTOP; - printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - - /* 2. poll split queues until all DMA activity is done */ - printk(KERN_DEBUG "2a. starting to poll split queues\n"); - target = calgary_reg(bbar, split_queue_offset(bus)); - do { - val64 = readq(target); - i++; - } while ((val64 & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); - - /* 3. poll Page Migration DEBUG for SoftStopFault */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); - - /* 4. if SoftStopFault - goto (1) */ - if (val & PMR_SOFTSTOPFAULT) { - if (++count < 100) - goto begin; - else { - pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); - return; /* pray for the best */ - } - } - - /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); - - /* 6. invalidate TCE cache */ - printk(KERN_DEBUG "6. invalidating TCE cache\n"); - target = calgary_reg(bbar, tar_offset(bus)); - writeq(tbl->tar_val, target); - - /* 7. Re-read PMCR */ - printk(KERN_DEBUG "7a. Re-reading PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); - - /* 8. Remove HardStop */ - printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = 0; - printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); -} - -static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, - u64 limit) -{ - unsigned int numpages; - - limit = limit | 0xfffff; - limit++; - - numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(pci_iommu(dev->bus), start, numpages); -} - -static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) -{ - void __iomem *target; - u64 low, high, sizelow; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* peripheral MEM_1 region */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); - sizelow = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) -{ - void __iomem *target; - u32 val32; - u64 low, high, sizelow, sizehigh; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* is it enabled? */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - if (!(val32 & PHB_MEM2_ENABLE)) - return; - - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); - sizelow = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); - sizehigh = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = (sizehigh << 32) | sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -/* - * some regions of the IO address space do not get translated, so we - * must not give devices IO addresses in those regions. The regions - * are the 640KB-1MB region and the two PCI peripheral memory holes. - * Reserve all of them in the IOMMU bitmap to avoid giving them out - * later. - */ -static void __init calgary_reserve_regions(struct pci_dev *dev) -{ - unsigned int npages; - u64 start; - struct iommu_table *tbl = pci_iommu(dev->bus); - - /* avoid the BIOS/VGA first 640KB-1MB region */ - /* for CalIOC2 - avoid the entire first MB */ - if (is_calgary(dev->device)) { - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; - } else { /* calioc2 */ - start = 0; - npages = (1 * 1024 * 1024) >> PAGE_SHIFT; - } - iommu_range_reserve(tbl, start, npages); - - /* reserve the two PCI peripheral memory regions in IO space */ - calgary_reserve_peripheral_mem_1(dev); - calgary_reserve_peripheral_mem_2(dev); -} - -static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) -{ - u64 val64; - u64 table_phys; - void __iomem *target; - int ret; - struct iommu_table *tbl; - - /* build TCE tables for each PHB */ - ret = build_tce_table(dev, bbar); - if (ret) - return ret; - - tbl = pci_iommu(dev->bus); - tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - - if (is_kdump_kernel()) - calgary_init_bitmap_from_tce_table(tbl); - else - tce_free(tbl, 0, tbl->it_size); - - if (is_calgary(dev->device)) - tbl->chip_ops = &calgary_chip_ops; - else if (is_calioc2(dev->device)) - tbl->chip_ops = &calioc2_chip_ops; - else - BUG(); - - calgary_reserve_regions(dev); - - /* set TARs for each PHB */ - target = calgary_reg(bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - - /* zero out all TAR bits under sw control */ - val64 &= ~TAR_SW_BITS; - table_phys = (u64)__pa(tbl->it_base); - - val64 |= table_phys; - - BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); - val64 |= (u64) specified_table_size; - - tbl->tar_val = cpu_to_be64(val64); - - writeq(tbl->tar_val, target); - readq(target); /* flush */ - - return 0; -} - -static void __init calgary_free_bus(struct pci_dev *dev) -{ - u64 val64; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *target; - unsigned int bitmapsz; - - target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - val64 &= ~TAR_SW_BITS; - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ - - bitmapsz = tbl->it_size / BITS_PER_BYTE; - free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); - tbl->it_map = NULL; - - kfree(tbl); - - set_pci_iommu(dev->bus, NULL); - - /* Can't free bootmem allocated memory after system is up :-( */ - bus_info[dev->bus->number].tce_space = NULL; -} - -static void calgary_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 csr, plssr; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", - tbl->it_busno, csr, plssr); -} - -static void calioc2_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - u32 csr, csmr, plssr, mck, rcstat; - void __iomem *target; - unsigned long phboff = phb_offset(tbl->it_busno); - unsigned long erroff; - u32 errregs[7]; - int i; - - /* dump CSR */ - target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - /* dump PLSSR */ - target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - /* dump CSMR */ - target = calgary_reg(bbar, phboff | 0x290); - csmr = be32_to_cpu(readl(target)); - /* dump mck */ - target = calgary_reg(bbar, phboff | 0x800); - mck = be32_to_cpu(readl(target)); - - pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - - pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); - - /* dump rest of error regs */ - pr_emerg(""); - for (i = 0; i < ARRAY_SIZE(errregs); i++) { - /* err regs are at 0x810 - 0x870 */ - erroff = (0x810 + (i * 0x10)); - target = calgary_reg(bbar, phboff | erroff); - errregs[i] = be32_to_cpu(readl(target)); - pr_cont("0x%08x@0x%lx ", errregs[i], erroff); - } - pr_cont("\n"); - - /* root complex status */ - target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); - rcstat = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, - PHB_ROOT_COMPLEX_STATUS); -} - -static void calgary_watchdog(struct timer_list *t) -{ - struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); - void __iomem *bbar = tbl->bbar; - u32 val32; - void __iomem *target; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - val32 = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - if (val32 & CSR_AGENT_MASK) { - tbl->chip_ops->dump_error_regs(tbl); - - /* reset error */ - writel(0, target); - - /* Disable bus that caused the error */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_SLOT_DISABLE; - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - } else { - /* Reset the timer */ - mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); - } -} - -static void __init calgary_set_split_completion_timeout(void __iomem *bbar, - unsigned char busnum, unsigned long timeout) -{ - u64 val64; - void __iomem *target; - unsigned int phb_shift = ~0; /* silence gcc */ - u64 mask; - - switch (busno_to_phbid(busnum)) { - case 0: phb_shift = (63 - 19); - break; - case 1: phb_shift = (63 - 23); - break; - case 2: phb_shift = (63 - 27); - break; - case 3: phb_shift = (63 - 35); - break; - default: - BUG_ON(busno_to_phbid(busnum)); - } - - target = calgary_reg(bbar, CALGARY_CONFIG_REG); - val64 = be64_to_cpu(readq(target)); - - /* zero out this PHB's timer bits */ - mask = ~(0xFUL << phb_shift); - val64 &= mask; - val64 |= (timeout << phb_shift); - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ -} - -static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 val; - - /* - * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 - */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); - val = cpu_to_be32(readl(target)); - val |= 0x00800000; - writel(cpu_to_be32(val), target); -} - -static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (is_calgary(dev->device) && (busnum == 1)) - calgary_set_split_completion_timeout(tbl->bbar, busnum, - CCR_2SEC_TIMEOUT); -} - -static void __init calgary_enable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* enable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - - printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", - (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? - "Calgary" : "CalIOC2", busnum); - printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " - "bus.\n"); - - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); - mod_timer(&tbl->watchdog_timer, jiffies); -} - -static void __init calgary_disable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* disable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); - - printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - del_timer_sync(&tbl->watchdog_timer); -} - -static void __init calgary_init_one_nontraslated(struct pci_dev *dev) -{ - pci_dev_get(dev); - set_pci_iommu(dev->bus, NULL); - - /* is the device behind a bridge? */ - if (dev->bus->parent) - dev->bus->parent->self = dev; - else - dev->bus->self = dev; -} - -static int __init calgary_init_one(struct pci_dev *dev) -{ - void __iomem *bbar; - struct iommu_table *tbl; - int ret; - - bbar = busno_to_bbar(dev->bus->number); - ret = calgary_setup_tar(dev, bbar); - if (ret) - goto done; - - pci_dev_get(dev); - - if (dev->bus->parent) { - if (dev->bus->parent->self) - printk(KERN_WARNING "Calgary: IEEEE, dev %p has " - "bus->parent->self!\n", dev); - dev->bus->parent->self = dev; - } else - dev->bus->self = dev; - - tbl = pci_iommu(dev->bus); - tbl->chip_ops->handle_quirks(tbl, dev); - - calgary_enable_translation(dev); - - return 0; - -done: - return ret; -} - -static int __init calgary_locate_bbars(void) -{ - int ret; - int rioidx, phb, bus; - void __iomem *bbar; - void __iomem *target; - unsigned long offset; - u8 start_bus, end_bus; - u32 val; - - ret = -ENODATA; - for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { - struct rio_detail *rio = rio_devs[rioidx]; - - if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) - continue; - - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); - if (!bbar) - goto error; - - for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { - offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; - target = calgary_reg(bbar, offset); - - val = be32_to_cpu(readl(target)); - - start_bus = (u8)((val & 0x00FF0000) >> 16); - end_bus = (u8)((val & 0x0000FF00) >> 8); - - if (end_bus) { - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; - } - } else { - bus_info[start_bus].bbar = bbar; - bus_info[start_bus].phbid = phb; - } - } - } - - return 0; - -error: - /* scan bus_info and iounmap any bbars we previously ioremap'd */ - for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) - if (bus_info[bus].bbar) - iounmap(bus_info[bus].bbar); - - return ret; -} - -static int __init calgary_init(void) -{ - int ret; - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - ret = calgary_locate_bbars(); - if (ret) - return ret; - - /* Purely for kdump kernel case */ - if (is_kdump_kernel()) - get_tce_space_from_tar(); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - calgary_init_one_nontraslated(dev); - continue; - } - - if (!info->tce_space && !translate_empty_slots) - continue; - - ret = calgary_init_one(dev); - if (ret) - goto error; - } while (1); - - dev = NULL; - for_each_pci_dev(dev) { - struct iommu_table *tbl; - - tbl = find_iommu_table(&dev->dev); - - if (translation_enabled(tbl)) - dev->dev.dma_ops = &calgary_dma_ops; - } - - return ret; - -error: - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - pci_dev_put(dev); - continue; - } - if (!info->tce_space && !translate_empty_slots) - continue; - - calgary_disable_translation(dev); - calgary_free_bus(dev); - pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.dma_ops = NULL; - } while (1); - - return ret; -} - -static inline int __init determine_tce_table_size(void) -{ - int ret; - - if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) - return specified_table_size; - - if (is_kdump_kernel() && saved_max_pfn) { - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); - if (ret > TCE_TABLE_SIZE_8M) - ret = TCE_TABLE_SIZE_8M; - } else { - /* - * Use 8M by default (suggested by Muli) if it's not - * kdump kernel and saved_max_pfn isn't set. - */ - ret = TCE_TABLE_SIZE_8M; - } - - return ret; -} - -static int __init build_detail_arrays(void) -{ - unsigned long ptr; - unsigned numnodes, i; - int scal_detail_size, rio_detail_size; - - numnodes = rio_table_hdr->num_scal_dev; - if (numnodes > MAX_NUMNODES){ - printk(KERN_WARNING - "Calgary: MAX_NUMNODES too low! Defined as %d, " - "but system has %d nodes.\n", - MAX_NUMNODES, numnodes); - return -ENODEV; - } - - switch (rio_table_hdr->version){ - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - default: - printk(KERN_WARNING - "Calgary: Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return -EPROTO; - } - - ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < numnodes; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; - i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 0; -} - -static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) -{ - int dev; - u32 val; - - if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { - /* - * FIXME: properly scan for devices across the - * PCI-to-PCI bridge on every CalIOC2 port. - */ - return 1; - } - - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff) - break; - } - return (val != 0xffffffff); -} - -/* - * calgary_init_bitmap_from_tce_table(): - * Function for kdump case. In the second/kdump kernel initialize - * the bitmap based on the tce table entries obtained from first kernel - */ -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) -{ - u64 *tp; - unsigned int index; - tp = ((u64 *)tbl->it_base); - for (index = 0 ; index < tbl->it_size; index++) { - if (*tp != 0x0) - set_bit(index, tbl->it_map); - tp++; - } -} - -/* - * get_tce_space_from_tar(): - * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base address register of calgary iommu - */ -static void __init get_tce_space_from_tar(void) -{ - int bus; - void __iomem *target; - unsigned long tce_space; - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - target = calgary_reg(bus_info[bus].bbar, - tar_offset(bus)); - tce_space = be64_to_cpu(readq(target)); - tce_space = tce_space & TAR_SW_BITS; - - tce_space = tce_space & (~specified_table_size); - info->tce_space = (u64 *)__va(tce_space); - } - } - return; -} - -static int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - return 0; -} - -int __init detect_calgary(void) -{ - int bus; - void *tbl; - int calgary_found = 0; - unsigned long ptr; - unsigned int offset, prev_offset; - int ret; - - /* - * if the user specified iommu=off or iommu=soft or we found - * another HW IOMMU already, bail out. - */ - if (no_iommu || iommu_detected) - return -ENODEV; - - if (!use_calgary) - return -ENODEV; - - if (!early_pci_allowed()) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); - - ptr = (unsigned long)phys_to_virt(get_bios_ebda()); - - rio_table_hdr = NULL; - prev_offset = 0; - offset = 0x180; - /* - * The next offset is stored in the 1st word. - * Only parse up until the offset increases: - */ - while (offset > prev_offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - prev_offset = offset; - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " - "in EBDA - bailing!\n"); - return -ENODEV; - } - - ret = build_detail_arrays(); - if (ret) { - printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return -ENOMEM; - } - - specified_table_size = determine_tce_table_size(); - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - /* - * If it is kdump kernel, find and use tce tables - * from first kernel, else allocate tce tables here - */ - if (!is_kdump_kernel()) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - } - calgary_found = 1; - } - } - - printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", - calgary_found ? "found" : "not found"); - - if (calgary_found) { - iommu_detected = 1; - calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", - specified_table_size); - - x86_init.iommu.iommu_init = calgary_iommu_init; - } - return calgary_found; - -cleanup: - for (--bus; bus >= 0; --bus) { - struct calgary_bus_info *info = &bus_info[bus]; - - if (info->tce_space) - free_tce_table(info->tce_space); - } - return -ENOMEM; -} - -static int __init calgary_parse_options(char *p) -{ - unsigned int bridge; - unsigned long val; - size_t len; - ssize_t ret; - - while (*p) { - if (!strncmp(p, "64k", 3)) - specified_table_size = TCE_TABLE_SIZE_64K; - else if (!strncmp(p, "128k", 4)) - specified_table_size = TCE_TABLE_SIZE_128K; - else if (!strncmp(p, "256k", 4)) - specified_table_size = TCE_TABLE_SIZE_256K; - else if (!strncmp(p, "512k", 4)) - specified_table_size = TCE_TABLE_SIZE_512K; - else if (!strncmp(p, "1M", 2)) - specified_table_size = TCE_TABLE_SIZE_1M; - else if (!strncmp(p, "2M", 2)) - specified_table_size = TCE_TABLE_SIZE_2M; - else if (!strncmp(p, "4M", 2)) - specified_table_size = TCE_TABLE_SIZE_4M; - else if (!strncmp(p, "8M", 2)) - specified_table_size = TCE_TABLE_SIZE_8M; - - len = strlen("translate_empty_slots"); - if (!strncmp(p, "translate_empty_slots", len)) - translate_empty_slots = 1; - - len = strlen("disable"); - if (!strncmp(p, "disable", len)) { - p += len; - if (*p == '=') - ++p; - if (*p == '\0') - break; - ret = kstrtoul(p, 0, &val); - if (ret) - break; - - bridge = val; - if (bridge < MAX_PHB_BUS_NUM) { - printk(KERN_INFO "Calgary: disabling " - "translation for PHB %#x\n", bridge); - bus_info[bridge].translation_disabled = 1; - } - } - - p = strpbrk(p, ","); - if (!p) - break; - - p++; /* skip ',' */ - } - return 1; -} -__setup("calgary=", calgary_parse_options); - -static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) -{ - struct iommu_table *tbl; - unsigned int npages; - int i; - - tbl = pci_iommu(dev->bus); - - for (i = 0; i < 4; i++) { - struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; - - /* Don't give out TCEs that map MEM resources */ - if (!(r->flags & IORESOURCE_MEM)) - continue; - - /* 0-based? we reserve the whole 1st MB anyway */ - if (!r->start) - continue; - - /* cover the whole region */ - npages = resource_size(r) >> PAGE_SHIFT; - npages++; - - iommu_range_reserve(tbl, r->start, npages); - } -} - -static int __init calgary_fixup_tce_spaces(void) -{ - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - if (no_iommu || swiotlb || !calgary_detected) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) - continue; - - if (!info->tce_space) - continue; - - calgary_fixup_one_tce_space(dev); - - } while (1); - - return 0; -} - -/* - * We need to be call after pcibios_assign_resources (fs_initcall level) - * and before device_initcall. - */ -rootfs_initcall(calgary_fixup_tce_spaces); - -IOMMU_INIT_POST(detect_calgary); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f62b498b18fb..5dcedad21dff 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-direct.h> #include <linux/dma-debug.h> +#include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> #include <linux/memblock.h> @@ -11,7 +12,6 @@ #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/calgary.h> #include <asm/x86_init.h> #include <asm/iommu_table.h> @@ -34,21 +34,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* - * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA translation for - * devices and allow every device to access to whole physical memory. This is - * useful if a user wants to use an IOMMU only for KVM device assignment to - * guests and not for driver dma translation. - * It is also possible to disable by default in kernel config, and enable with - * iommu=nopt at boot time. - */ -#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH -int iommu_pass_through __read_mostly = 1; -#else -int iommu_pass_through __read_mostly; -#endif - extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; void __init pci_iommu_alloc(void) @@ -120,17 +105,12 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif if (!strncmp(p, "pt", 2)) - iommu_pass_through = 1; + iommu_set_default_passthrough(true); if (!strncmp(p, "nopt", 4)) - iommu_pass_through = 0; + iommu_set_default_translated(true); gart_parse_options(p); -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - p += strcspn(p, ","); if (*p == ',') ++p; @@ -160,7 +140,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 5f5302028a9a..c2cfa5e7c152 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* Glue code to lib/swiotlb.c */ #include <linux/pci.h> #include <linux/cache.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 75fea0d48c0e..839b5244e3b7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,6 +41,7 @@ #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> +#include <asm/io_bitmap.h> #include <asm/proto.h> #include "process.h" @@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, #endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, -#ifdef CONFIG_X86_32 - /* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); @@ -110,28 +102,89 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; - if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); - - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - kfree(bp); - } + if (test_thread_flag(TIF_IO_BITMAP)) + io_bitmap_exit(); free_vm86(t); fpu__drop(fpu); } +static int set_new_tls(struct task_struct *p, unsigned long tls) +{ + struct user_desc __user *utls = (struct user_desc __user *)tls; + + if (in_ia32_syscall()) + return do_set_thread_area(p, -1, utls, 0); + else + return do_set_thread_area_64(p, ARCH_SET_FS, tls); +} + +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) +{ + struct inactive_task_frame *frame; + struct fork_frame *fork_frame; + struct pt_regs *childregs; + int ret = 0; + + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; + p->thread.io_bitmap = NULL; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef CONFIG_X86_64 + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); +#else + p->thread.sp0 = (unsigned long) (childregs + 1); + /* + * Clear all status flags including IF and set fixed bit. 64bit + * does not have this initialization as the frame does not contain + * flags. The flags consistency (especially vs. AC) is there + * ensured via objtool, which lacks 32bit support. + */ + frame->flags = X86_EFLAGS_FIXED; +#endif + + /* Kernel thread ? */ + if (unlikely(p->flags & PF_KTHREAD)) { + memset(childregs, 0, sizeof(struct pt_regs)); + kthread_frame_init(frame, sp, arg); + return 0; + } + + frame->bx = 0; + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + +#ifdef CONFIG_X86_32 + task_user_gs(p) = get_user_gs(current_pt_regs()); +#endif + + /* Set a new TLS for the child thread? */ + if (clone_flags & CLONE_SETTLS) + ret = set_new_tls(p, tls); + + if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) + io_bitmap_share(p); + + return ret; +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -269,31 +322,96 @@ void arch_setup_new_exec(void) } } -static inline void switch_to_bitmap(struct thread_struct *prev, - struct thread_struct *next, - unsigned long tifp, unsigned long tifn) +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) +{ + /* + * Invalidate the I/O bitmap by moving io_bitmap_base outside the + * TSS limit so any subsequent I/O access from user space will + * trigger a #GP. + * + * This is correct even when VMEXIT rewrites the TSS limit + * to 0x67 as the only requirement is that the base points + * outside the limit. + */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; +} + +static inline void switch_to_bitmap(unsigned long tifp) +{ + /* + * Invalidate I/O bitmap if the previous task used it. This prevents + * any possible leakage of an active I/O bitmap. + * + * If the next task has an I/O bitmap it will handle it on exit to + * user mode. + */ + if (tifp & _TIF_IO_BITMAP) + tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); +} + +static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. + */ + tss->io_bitmap.prev_max = iobm->max; + tss->io_bitmap.prev_sequence = iobm->sequence; +} + +/** + * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + */ +void tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct thread_struct *t = ¤t->thread; + u16 *base = &tss->x86_tss.io_bitmap_base; + + if (!test_thread_flag(TIF_IO_BITMAP)) { + tss_invalidate_io_bitmap(tss); + return; + } + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; - if (tifn & _TIF_IO_BITMAP) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - /* - * Make sure that the TSS limit is correct for the CPU - * to notice the IO bitmap. - */ - refresh_tss_limit(); - } else if (tifp & _TIF_IO_BITMAP) { /* - * Clear any possible leftover bits: + * Only copy bitmap data when the sequence number differs. The + * update time is accounted to the incoming task. */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; } + + /* + * Make sure that the TSS limit is covering the IO bitmap. It might have + * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O + * access from user space to trigger a #GP because tbe bitmap is outside + * the TSS limit. + */ + refresh_tss_limit(); } +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif #ifdef CONFIG_SMP @@ -497,15 +615,12 @@ void speculation_ctrl_update_current(void) void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev, *next; unsigned long tifp, tifn; - prev = &prev_p->thread; - next = &next_p->thread; - tifn = READ_ONCE(task_thread_info(next_p)->flags); tifp = READ_ONCE(task_thread_info(prev_p)->flags); - switch_to_bitmap(prev, next, tifp, tifn); + + switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); @@ -580,7 +695,7 @@ void __cpuidle default_idle(void) safe_halt(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } -#ifdef CONFIG_APM_MODULE +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); #endif diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 320ab978fb1f..1d0797b2338a 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // // Code shared between 32 and 64 bit diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b8ceec4974fe..5052ced43373 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - struct pt_regs *childregs = task_pt_regs(p); - struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); - struct inactive_task_frame *frame = &fork_frame->frame; - struct task_struct *tsk; - int err; - - /* - * For a new task use the RESET flags value since there is no before. - * All the status flags are zero; DF and all the system flags must also - * be 0, specifically IF must be 0 because we context switch to the new - * task with interrupts disabled. - */ - frame->flags = X86_EFLAGS_FIXED; - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.sp0 = (unsigned long) (childregs+1); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->di = arg; - p->thread.io_bitmap_ptr = NULL; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - task_user_gs(p) = get_user_gs(current_pt_regs()); - - p->thread.io_bitmap_ptr = NULL; - tsk = current; - err = -ENOMEM; - - if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - err = 0; - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { @@ -192,7 +124,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ip = new_ip; regs->sp = new_sp; regs->flags = X86_EFLAGS_IF; - force_iret(); } EXPORT_SYMBOL_GPL(start_thread); @@ -255,15 +186,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_TLS(next, cpu); - /* - * Restore IOPL if needed. In normal use, the flags restore - * in the switch assembly will handle this. But if the kernel - * is running virtualized at a non-zero CPL, the popf will - * not restore flags, so it must be done in a separate step. - */ - if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - switch_to_extra(prev_p, next_p); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index af64519b2695..ffd497804dbc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) task->thread.gsbase = gsbase; } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - int err; - struct pt_regs *childregs; - struct fork_frame *fork_frame; - struct inactive_task_frame *frame; - struct task_struct *me = current; - - childregs = task_pt_regs(p); - fork_frame = container_of(childregs, struct fork_frame, regs); - frame = &fork_frame->frame; - - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.io_bitmap_ptr = NULL; - - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; - savesegment(es, p->thread.es); - savesegment(ds, p->thread.ds); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->r12 = arg; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (in_ia32_syscall()) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - else -#endif - err = do_arch_prctl_64(p, ARCH_SET_FS, tls); - if (err) - goto out; - } - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - - return err; -} - static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, @@ -469,7 +394,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - force_iret(); } void @@ -572,17 +496,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_to_extra(prev_p, next_p); -#ifdef CONFIG_XEN_PV - /* - * On Xen PV, IOPL bits in pt_regs->flags have no effect, and - * current_pt_regs()->flags may not match the current task's - * intended IOPL. We need to switch it manually. - */ - if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && - prev->iopl != next->iopl)) - xen_set_iopl_mask(next->iopl); -#endif - if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3c5bbe8e4120..f0e1ddbc2fd7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> +#include <asm/io_bitmap.h> #include "tls.h" @@ -181,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. */ @@ -208,10 +212,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - if (task == current) - set_user_gs(task_pt_regs(task), value); - else - task_user_gs(task) = value; + task_user_gs(task) = value; } return 0; @@ -271,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; + /* + * This function has some ABI oddities. + * + * A 32-bit ptracer probably expects that writing FS or GS will change + * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, + * this code indeed has that effect. When FSGSBASE is added, this + * will require a special case. + * + * For existing 64-bit ptracers, writing FS or GS *also* currently + * changes the base if the selector is nonzero the next time the task + * is run. This behavior may not be needed, and trying to preserve it + * when FSGSBASE is added would be complicated at best. + */ + switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; - if (task == current) - loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; - if (task == current) - load_gs_index(task->thread.gsindex); break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; - if (task == current) - loadsegment(ds, task->thread.ds); break; case offsetof(struct user_regs_struct,es): task->thread.es = value; - if (task == current) - loadsegment(es, task->thread.es); break; /* @@ -374,6 +384,9 @@ static int putreg(struct task_struct *child, * When changing the FS base, use do_arch_prctl_64() * to set the index to zero and to set the base * as requested. + * + * NB: This behavior is nonsensical and likely needs to + * change when FSGSBASE support is added. */ if (child->thread.fsbase != value) return do_arch_prctl_64(child, ARCH_SET_FS, value); @@ -697,7 +710,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { - return target->thread.io_bitmap_max / regset->size; + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, @@ -705,12 +720,13 @@ static int ioperm_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (!target->thread.io_bitmap_ptr) + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) return -ENXIO; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + iobm->bitmap, 0, IO_BITMAP_BYTES); } /* diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 8451f38ad399..896d74cb5081 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -90,8 +90,6 @@ static void ich_force_hpet_resume(void) BUG(); else printk(KERN_DEBUG "Force enabled HPET at resume\n"); - - return; } static void ich_force_enable_hpet(struct pci_dev *dev) @@ -112,7 +110,7 @@ static void ich_force_enable_hpet(struct pci_dev *dev) } /* use bits 31:14, 16 kB aligned */ - rcba_base = ioremap_nocache(rcba, 0x4000); + rcba_base = ioremap(rcba, 0x4000); if (rcba_base == NULL) { dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " "cannot force enable HPET\n"); @@ -448,7 +446,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } /* ISA Bridges */ @@ -513,7 +510,6 @@ static void e6xx_force_enable_hpet(struct pci_dev *dev) force_hpet_resume_type = NONE_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " "0x%lx\n", force_hpet_address); - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, e6xx_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 09d6bded3c1e..0cc7c0b106bb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -828,11 +828,6 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -static void smp_send_nmi_allbutself(void) -{ - apic->send_IPI_allbutself(NMI_VECTOR); -} - /* * Halt all other CPUs, calling the specified function on each of them * @@ -861,7 +856,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) */ wmb(); - smp_send_nmi_allbutself(); + apic_send_IPI_allbutself(NMI_VECTOR); /* Kick CPUs looping in NMI context. */ WRITE_ONCE(crash_ipi_issued, 1); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index ee26df08002e..94b33885f8d2 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -35,8 +35,7 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* Save the CPU context, used for jumping back */ pushl %ebx @@ -93,8 +92,9 @@ relocate_kernel: addl $(identity_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushl $0 /* store the start address on the stack */ @@ -191,8 +191,9 @@ identity_mapped: addl $(virtual_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movl CR4(%edi), %eax movl %eax, %cr4 movl CR3(%edi), %eax @@ -208,9 +209,10 @@ virtual_mapped: popl %esi popl %ebx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl 8(%esp), %edx movl 4(%esp), %ecx pushl %ebp @@ -270,6 +272,7 @@ swap_pages: popl %ebx popl %ebp ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c51ccff5cd01..ef3ba99068d3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -38,8 +38,7 @@ .text .align PAGE_SIZE .code64 - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* * %rdi indirection_page * %rsi page_list @@ -103,8 +102,9 @@ relocate_kernel: addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -209,8 +209,9 @@ identity_mapped: movq $virtual_mapped, %rax pushq %rax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -228,9 +229,10 @@ virtual_mapped: popq %rbp popq %rbx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi @@ -283,6 +285,7 @@ swap_pages: jmp 0b 3: ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..a74262c71484 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -2,130 +2,54 @@ /* * Copyright (C) 1995 Linus Torvalds * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * - * Memory region support - * David Parsons <orc@pell.chi.il.us>, July-August 1999 - * - * Added E820 sanitization routine (removes overlapping memory regions); - * Brian Moyle <bmoyle@mvista.com>, February 2001 - * - * Moved CPU detection code to cpu/${cpu}.c - * Patrick Mochel <mochel@osdl.org>, March 2002 - * - * Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * + * This file contains the setup_arch() code, which handles the architecture-dependent + * parts of early kernel initialization. */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/screen_info.h> -#include <linux/ioport.h> -#include <linux/acpi.h> -#include <linux/sfi.h> -#include <linux/apm_bios.h> -#include <linux/initrd.h> -#include <linux/memblock.h> -#include <linux/seq_file.h> #include <linux/console.h> -#include <linux/root_dev.h> -#include <linux/highmem.h> -#include <linux/export.h> +#include <linux/crash_dump.h> +#include <linux/dmi.h> #include <linux/efi.h> -#include <linux/init.h> -#include <linux/edd.h> +#include <linux/init_ohci1394_dma.h> +#include <linux/initrd.h> #include <linux/iscsi_ibft.h> -#include <linux/nodemask.h> -#include <linux/kexec.h> -#include <linux/dmi.h> -#include <linux/pfn.h> +#include <linux/memblock.h> #include <linux/pci.h> -#include <asm/pci-direct.h> -#include <linux/init_ohci1394_dma.h> -#include <linux/kvm_para.h> -#include <linux/dma-contiguous.h> -#include <xen/xen.h> -#include <uapi/linux/mount.h> - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/delay.h> - -#include <linux/kallsyms.h> -#include <linux/cpufreq.h> -#include <linux/dma-mapping.h> -#include <linux/ctype.h> -#include <linux/uaccess.h> - -#include <linux/percpu.h> -#include <linux/crash_dump.h> +#include <linux/root_dev.h> +#include <linux/sfi.h> #include <linux/tboot.h> -#include <linux/jiffies.h> -#include <linux/mem_encrypt.h> -#include <linux/sizes.h> - #include <linux/usb/xhci-dbgp.h> -#include <video/edid.h> -#include <asm/mtrr.h> +#include <uapi/linux/mount.h> + +#include <xen/xen.h> + #include <asm/apic.h> -#include <asm/realmode.h> -#include <asm/e820/api.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/efi.h> -#include <asm/timer.h> -#include <asm/i8259.h> -#include <asm/sections.h> -#include <asm/io_apic.h> -#include <asm/ist.h> -#include <asm/setup_arch.h> #include <asm/bios_ebda.h> -#include <asm/cacheflush.h> -#include <asm/processor.h> #include <asm/bugs.h> -#include <asm/kasan.h> - -#include <asm/vsyscall.h> #include <asm/cpu.h> -#include <asm/desc.h> -#include <asm/dma.h> -#include <asm/iommu.h> +#include <asm/efi.h> #include <asm/gart.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> - -#include <asm/paravirt.h> #include <asm/hypervisor.h> -#include <asm/olpc_ofw.h> - -#include <asm/percpu.h> -#include <asm/topology.h> -#include <asm/apicdef.h> -#include <asm/amd_nb.h> +#include <asm/io_apic.h> +#include <asm/kasan.h> +#include <asm/kaslr.h> #include <asm/mce.h> -#include <asm/alternative.h> +#include <asm/mtrr.h> +#include <asm/realmode.h> +#include <asm/olpc_ofw.h> +#include <asm/pci-direct.h> #include <asm/prom.h> -#include <asm/microcode.h> -#include <asm/kaslr.h> +#include <asm/proto.h> #include <asm/unwind.h> +#include <asm/vsyscall.h> +#include <linux/vmalloc.h> /* - * max_low_pfn_mapped: highest direct mapped pfn under 4GB - * max_pfn_mapped: highest direct mapped pfn over 4GB + * max_low_pfn_mapped: highest directly mapped pfn < 4 GB + * max_pfn_mapped: highest directly mapped pfn > 4 GB * * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are - * represented by pfn_mapped + * represented by pfn_mapped[]. */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -135,14 +59,30 @@ RESERVE_BRK(dmi_alloc, 65536); #endif -static __initdata unsigned long _brk_start = (unsigned long)__brk_base; -unsigned long _brk_end = (unsigned long)__brk_base; +/* + * Range of the BSS area. The size of the BSS area is determined + * at link time, with RESERVE_BRK*() facility reserving additional + * chunks. + */ +static __initdata +unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; struct boot_params boot_params; /* - * Machine setup.. + * These are the four main kernel memory regions, we put them into + * the resource tree so that kdump tools and other debugging tools + * recover it: */ + +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + static struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -166,16 +106,16 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head_32.S */ +/* CPU data as detected by the assembly code in head_32.S */ struct cpuinfo_x86 new_cpu_data; -/* common cpu data for all cpus */ +/* Common CPU data for all CPUs */ struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; -/* for MCA, but anyone else can use it if they want */ +/* For MCA, but anyone else can use it if they want */ unsigned int machine_id; unsigned int machine_submodel_id; unsigned int BIOS_revision; @@ -438,6 +378,12 @@ static void __init memblock_x86_reserve_range_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + memblock_reserve(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len); + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -455,15 +401,15 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. * - * On 32 bits earlier kernels would limit the kernel to the low 512 MiB + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range * due to mapping restrictions. * - * On 64bit, kdump kernel need be restricted to be under 64TB, which is - * the upper limit of system RAM in 4-level paing mode. Since the kdump - * jumping could be from 5-level to 4-level, the jumping will fail if - * kernel is put above 64TB, and there's no way to detect the paging mode - * of the kernel which will be loaded for dumping during the 1st kernel - * bootup. + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. */ #ifdef CONFIG_X86_32 # define CRASH_ADDR_LOW_MAX SZ_512M @@ -486,7 +432,7 @@ static int __init reserve_crashkernel_low(void) ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret) { /* - * two parts from lib/swiotlb.c: + * two parts from kernel/dma/swiotlb.c: * -swiotlb size: user-specified with swiotlb= or default. * * -swiotlb overflow buffer: now hardcoded to 32k. We round it @@ -743,8 +689,8 @@ static void __init trim_bios_range(void) e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* - * special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); @@ -874,7 +820,7 @@ void __init setup_arch(char **cmdline_p) /* * Note: Quark X1000 CPUs advertise PGE incorrectly and require * a cr3 based tlb flush, so the following __flush_tlb_all() - * will not flush anything because the cpu quirk which clears + * will not flush anything because the CPU quirk which clears * X86_FEATURE_PGE has not been invoked yet. Though due to the * load_cr3() above the TLB has been flushed already. The * quirk is invoked before subsequent calls to __flush_tlb_all() @@ -947,11 +893,11 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; - mpx_mm_init(&init_mm); - code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; - data_resource.start = __pa_symbol(_etext); + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); data_resource.end = __pa_symbol(_edata)-1; bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; @@ -1040,6 +986,7 @@ void __init setup_arch(char **cmdline_p) /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); @@ -1122,17 +1069,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); @@ -1281,8 +1226,6 @@ void __init setup_arch(char **cmdline_p) #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; #endif #endif x86_init.oem.banner(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 86663874ef04..e6d7894ad127 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("%s allocator failed (%d), falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8eb7193e158d..8a29573851a3 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -151,8 +151,6 @@ static int restore_sigcontext(struct pt_regs *regs, err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - force_iret(); - return err; } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96421f97e75c..b8d4e9c3c070 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,46 +115,6 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ -static void native_smp_send_reschedule(int cpu) -{ - if (unlikely(cpu_is_offline(cpu))) { - WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); - return; - } - apic->send_IPI(cpu, RESCHEDULE_VECTOR); -} - -void native_send_call_func_single_ipi(int cpu) -{ - apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); -} - -void native_send_call_func_ipi(const struct cpumask *mask) -{ - cpumask_var_t allbutself; - - if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - return; - } - - cpumask_copy(allbutself, cpu_online_mask); - __cpumask_clear_cpu(smp_processor_id(), allbutself); - - if (cpumask_equal(mask, allbutself) && - cpumask_equal(cpu_online_mask, cpu_callout_mask)) - apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); - else - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - free_cpumask_var(allbutself); -} - static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ @@ -179,6 +139,12 @@ asmlinkage __visible void smp_reboot_interrupt(void) irq_exit(); } +static int register_stop_handler(void) +{ + return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop"); +} + static void native_stop_other_cpus(int wait) { unsigned long flags; @@ -209,42 +175,44 @@ static void native_stop_other_cpus(int wait) /* sync above data before sending IRQ */ wmb(); - apic->send_IPI_allbutself(REBOOT_VECTOR); + apic_send_IPI_allbutself(REBOOT_VECTOR); /* - * Don't wait longer than a second if the caller - * didn't ask us to wait. + * Don't wait longer than a second for IPI completion. The + * wait request is not checked here because that would + * prevent an NMI shutdown attempt in case that not all + * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && (wait || timeout--)) + while (num_online_cpus() > 1 && timeout--) udelay(1); } - - /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - /* Hope the REBOOT_IRQ is good enough */ - goto finish; - /* sync above data before sending IRQ */ - wmb(); - - pr_emerg("Shutting down cpus with NMI\n"); + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if (num_online_cpus() > 1) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. In any case try to wait for the other + * CPUs to stop. + */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { + /* Sync above data before sending IRQ */ + wmb(); - apic->send_IPI_allbutself(NMI_VECTOR); + pr_emerg("Shutting down cpus with NMI\n"); + apic_send_IPI_allbutself(NMI_VECTOR); + } /* - * Don't wait longer than a 10 ms if the caller - * didn't ask us to wait. + * Don't wait longer than 10 ms if the caller didn't + * reqeust it. If wait is true, the machine hangs here if + * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } -finish: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fdbd47ceb84d..69881b2d446c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1023,8 +1023,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, int *cpu0_nmi_registered) { - volatile u32 *trampoline_status = - (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ unsigned long start_ip = real_mode_header->trampoline_start; @@ -1116,9 +1114,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, } } - /* mark "stuck" area as not stuck */ - *trampoline_status = 0; - if (x86_platform.legacy.warm_reset) { /* * Cleanup possible dangling ends... @@ -1596,7 +1591,12 @@ int native_cpu_disable(void) if (ret) return ret; - clear_local_APIC(); + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + */ + apic_soft_disable(); cpu_disable_common(); return 0; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index f7476ce23b6e..ca3c11a17b5a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -22,7 +22,6 @@ #include <asm/elf.h> #include <asm/ia32.h> #include <asm/syscalls.h> -#include <asm/mpx.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. @@ -137,10 +136,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_unmapped_area_info info; unsigned long begin, end; - addr = mpx_unmapped_area_check(addr, len, flags); - if (IS_ERR_VALUE(addr)) - return addr; - if (flags & MAP_FIXED) return addr; @@ -180,10 +175,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; struct vm_unmapped_area_info info; - addr = mpx_unmapped_area_check(addr, len, flags); - if (IS_ERR_VALUE(addr)) - return addr; - /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 01f0e2263b86..298fc1edd9c9 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -90,11 +90,11 @@ __init int create_simplefb(const struct screen_info *si, if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) size <<= 16; length = mode->height * mode->stride; - length = PAGE_ALIGN(length); if (length > size) { printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); return -EINVAL; } + length = PAGE_ALIGN(length); /* setup IORESOURCE_MEM as framebuffer memory */ memset(&res, 0, sizeof(res)); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a49fe1dcb47e..b89f6ac6a0c0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -57,7 +57,7 @@ void __init tboot_probe(void) */ if (!e820__mapped_any(boot_params.tboot_addr, boot_params.tboot_addr, E820_TYPE_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -65,13 +65,12 @@ void __init tboot_probe(void) set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warning("tboot at 0x%llx is invalid\n", - boot_params.tboot_addr); + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); tboot = NULL; return; } if (tboot->version < 5) { - pr_warning("tboot version is invalid: %u\n", tboot->version); + pr_warn("tboot version is invalid: %u\n", tboot->version); tboot = NULL; return; } @@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { - pr_warning("unsupported sleep state 0x%x\n", sleep_state); + pr_warn("unsupported sleep state 0x%x\n", sleep_state); return -1; } @@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) if (!tboot_enabled()) return 0; - pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); return -ENODEV; } @@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps) } if (timeout) - pr_warning("tboot wait for APs timeout\n"); + pr_warn("tboot wait for APs timeout\n"); return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } @@ -355,7 +354,7 @@ static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t c void *kbuf; int ret = -EFAULT; - log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE); + log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE); if (!log_base) return ret; @@ -516,7 +515,7 @@ int tboot_force_iommu(void) return 1; if (no_iommu || swiotlb || dmar_disabled) - pr_warning("Forcing Intel-IOMMU to enabled\n"); + pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c deleted file mode 100644 index 6384be751eff..000000000000 --- a/arch/x86/kernel/tce_64.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file manages the translation entries for the IBM Calgary IOMMU. - * - * Derived from arch/powerpc/platforms/pseries/iommu.c - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <linux/memblock.h> -#include <asm/tce.h> -#include <asm/calgary.h> -#include <asm/proto.h> -#include <asm/cacheflush.h> - -/* flush a tce at 'tceaddr' to main memory */ -static inline void flush_tce(void* tceaddr) -{ - /* a single tce can't cross a cache line */ - if (boot_cpu_has(X86_FEATURE_CLFLUSH)) - clflush(tceaddr); - else - wbinvd(); -} - -void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction) -{ - u64* tp; - u64 t; - u64 rpn; - - t = (1 << TCE_READ_SHIFT); - if (direction != DMA_TO_DEVICE) - t |= (1 << TCE_WRITE_SHIFT); - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; - t &= ~TCE_RPN_MASK; - t |= (rpn << TCE_RPN_SHIFT); - - *tp = cpu_to_be64(t); - flush_tce(tp); - - uaddr += PAGE_SIZE; - tp++; - } -} - -void tce_free(struct iommu_table *tbl, long index, unsigned int npages) -{ - u64* tp; - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - *tp = cpu_to_be64(0); - flush_tce(tp); - tp++; - } -} - -static inline unsigned int table_size_to_number_of_entries(unsigned char size) -{ - /* - * size is the order of the table, 0-7 - * smallest table is 8K entries, so shift result by 13 to - * multiply by 8K - */ - return (1 << size) << 13; -} - -static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) -{ - unsigned int bitmapsz; - unsigned long bmppages; - int ret; - - tbl->it_busno = dev->bus->number; - - /* set the tce table size - measured in entries */ - tbl->it_size = table_size_to_number_of_entries(specified_table_size); - - /* - * number of bytes needed for the bitmap size in number of - * entries; we need one bit per entry - */ - bitmapsz = tbl->it_size / BITS_PER_BYTE; - bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); - if (!bmppages) { - printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); - ret = -ENOMEM; - goto done; - } - - tbl->it_map = (unsigned long*)bmppages; - - memset(tbl->it_map, 0, bitmapsz); - - tbl->it_hint = 0; - - spin_lock_init(&tbl->it_lock); - - return 0; - -done: - return ret; -} - -int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) -{ - struct iommu_table *tbl; - int ret; - - if (pci_iommu(dev->bus)) { - printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", - dev, pci_iommu(dev->bus)); - BUG(); - } - - tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); - if (!tbl) { - printk(KERN_ERR "Calgary: error allocating iommu_table\n"); - ret = -ENOMEM; - goto done; - } - - ret = tce_table_setparms(dev, tbl); - if (ret) - goto free_tbl; - - tbl->bbar = bbar; - - set_pci_iommu(dev->bus, tbl); - - return 0; - -free_tbl: - kfree(tbl); -done: - return ret; -} - -void * __init alloc_tce_table(void) -{ - unsigned int size; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - return memblock_alloc_low(size, size); -} - -void __init free_tce_table(void *tbl) -{ - unsigned int size; - - if (!tbl) - return; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - memblock_free(__pa(tbl), size); -} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 7ce29cee9f9e..d8673d8a779b 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -91,10 +91,18 @@ void __init hpet_time_init(void) static __init void x86_late_time_init(void) { + /* + * Before PIT/HPET init, select the interrupt mode. This is required + * to make the decision whether PIT should be initialized correct. + */ + x86_init.irqs.intr_mode_select(); + + /* Setup the legacy timers */ x86_init.timers.timer_init(); + /* - * After PIT/HPET timers init, select and setup - * the final interrupt mode for delivering IRQs. + * After PIT/HPET timers init, set up the final interrupt mode for + * delivering IRQs. */ x86_init.irqs.intr_mode_init(); tsc_init(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bb0f8447112..6ef00eb6fbb9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,11 +37,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -57,10 +52,10 @@ #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> -#include <asm/trace/mpx.h> -#include <asm/mpx.h> #include <asm/vm86.h> #include <asm/umip.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -311,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +/* + * Runs on an IST stack for x86_64 and on a special task stack for x86_32. + * + * On x86_64, this is more or less a normal kernel entry. Notwithstanding the + * SDM's warnings about double faults being unrecoverable, returning works as + * expected. Presumably what the SDM actually means is that the CPU may get + * the register state wrong on entry, so returning could be a bad idea. + * + * Various CPU engineers have promised that double faults due to an IRET fault + * while the stack is read-only are, in fact, recoverable. + * + * On x86_32, this is entered through a task gate, and regs are synthesized + * from the TSS. Returning is, in principle, okay, but changes to regs will + * be lost. If, for some reason, we need to return to a context with modified + * regs, the shim code could be adjusted to synchronize the registers. + */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) { static const char str[] = "double fault"; @@ -416,22 +426,14 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif -#ifdef CONFIG_DOUBLEFAULT - df_debug(regs, error_code); -#endif - /* - * This is always a kernel trap and never fixable (and thus must - * never return). - */ - for (;;) - die(str, regs, error_code); + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + die("double fault", regs, error_code); + panic("Machine halted."); } #endif dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - const struct mpx_bndcsr *bndcsr; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) @@ -441,84 +443,60 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (!user_mode(regs)) die("bounds", regs, error_code); - if (!cpu_feature_enabled(X86_FEATURE_MPX)) { - /* The exception is not from Intel MPX */ - goto exit_trap; - } + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); +} - /* - * We need to look at BNDSTATUS to resolve this exception. - * A NULL here might mean that it is in its 'init state', - * which is all zeros which indicates MPX was not - * responsible for the exception. - */ - bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); - if (!bndcsr) - goto exit_trap; +enum kernel_gp_hint { + GP_NO_HINT, + GP_NON_CANONICAL, + GP_CANONICAL +}; - trace_bounds_exception_mpx(bndcsr); - /* - * The error code field of the BNDSTATUS register communicates status - * information of a bound range exception #BR or operation involving - * bound directory. - */ - switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { - case 2: /* Bound directory has invalid entry. */ - if (mpx_handle_bd_fault()) - goto exit_trap; - break; /* Success, it was handled */ - case 1: /* Bound violation. */ - { - struct task_struct *tsk = current; - struct mpx_fault_info mpx; - - if (mpx_fault_info(&mpx, regs)) { - /* - * We failed to decode the MPX instruction. Act as if - * the exception was not caused by MPX. - */ - goto exit_trap; - } - /* - * Success, we decoded the instruction and retrieved - * an 'mpx' containing the address being accessed - * which caused the exception. This information - * allows and application to possibly handle the - * #BR exception itself. - */ - if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs, - error_code)) - break; +/* + * When an uncaught #GP occurs, try to determine the memory address accessed by + * the instruction and return that address to the caller. Also, try to figure + * out whether any part of the access to that address was non-canonical. + */ +static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, + unsigned long *addr) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; - show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code); + if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return GP_NO_HINT; - force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper); - break; - } - case 0: /* No exception caused by Intel MPX operations. */ - goto exit_trap; - default: - die("bounds", regs, error_code); - } + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_modrm(&insn); + insn_get_sib(&insn); - return; + *addr = (unsigned long)insn_get_addr_ref(&insn, regs); + if (*addr == -1UL) + return GP_NO_HINT; -exit_trap: +#ifdef CONFIG_X86_64 /* - * This path out is for all the cases where we could not - * handle the exception in some way (like allocating a - * table or telling userspace about it. We will also end - * up here if the kernel has MPX turned off at compile - * time.. + * Check that: + * - the operand is not in the kernel half + * - the last byte of the operand is not in the user canonical half */ - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + if (*addr < ~__VIRTUAL_MASK && + *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) + return GP_NON_CANONICAL; +#endif + + return GP_CANONICAL; } -dotraplinkage void -do_general_protection(struct pt_regs *regs, long error_code) +#define GPFSTR "general protection fault" + +dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { - const char *desc = "general protection fault"; + char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; + enum kernel_gp_hint hint = GP_NO_HINT; struct task_struct *tsk; + unsigned long gp_addr; + int ret; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); @@ -535,48 +513,61 @@ do_general_protection(struct pt_regs *regs, long error_code) } tsk = current; - if (!user_mode(regs)) { - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + if (user_mode(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - /* - * To be potentially processing a kprobe fault and to - * trust the result from kprobe_running(), we have to - * be non-preemptible. - */ - if (!preemptible() && kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - return; + show_signal(tsk, SIGSEGV, "", desc, regs, error_code); + force_sig(SIGSEGV); - if (notify_die(DIE_GPF, desc, regs, error_code, - X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) - die(desc, regs, error_code); return; } + if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) + return; + tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && + kprobe_running() && + kprobe_fault_handler(regs, X86_TRAP_GP)) + return; + + ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); + if (ret == NOTIFY_STOP) + return; + + if (error_code) + snprintf(desc, sizeof(desc), "segment-related " GPFSTR); + else + hint = get_kernel_gp_address(regs, &gp_addr); + + if (hint != GP_NO_HINT) + snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", + (hint == GP_NON_CANONICAL) ? "probably for non-canonical address" + : "maybe for address", + gp_addr); + + /* + * KASAN is interested only in the non-canonical case, clear it + * otherwise. + */ + if (hint != GP_NON_CANONICAL) + gp_addr = 0; + + die_addr(desc, regs, error_code, gp_addr); - force_sig(SIGSEGV); } NOKPROBE_SYMBOL(do_general_protection); dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * ftrace must be first, everything else may cause a recursive crash. - * See note by declaration of modifying_ftrace_code in ftrace.c - */ - if (unlikely(atomic_read(&modifying_ftrace_code)) && - ftrace_int3_handler(regs)) - return; -#endif if (poke_int3_handler(regs)) return; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57d87f79558f..7e322e2daaf5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -638,7 +638,7 @@ unsigned long native_calibrate_tsc(void) * clock. */ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X) + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) crystal_khz = 25000; /* @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 067858fe4db8..e0cbe4f2af49 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -58,6 +58,10 @@ static const struct freq_desc freq_desc_ann = { 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }; +static const struct freq_desc freq_desc_lgm = { + 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } +}; + static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), @@ -65,6 +69,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index ec534f978867..32a818764e03 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -233,7 +233,6 @@ static cycles_t check_tsc_warp(unsigned int timeout) * The measurement runs for 'timeout' msecs: */ end = start + (cycles_t) tsc_khz * timeout; - now = start; for (i = 0; ; i++) { /* @@ -364,12 +363,12 @@ retry: /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); if (random_warps) - pr_warning("TSC warped randomly between CPUs\n"); + pr_warn("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5b345add550f..4d732a444711 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -1,6 +1,6 @@ /* - * umip.c Emulation for instruction protected by the Intel User-Mode - * Instruction Prevention feature + * umip.c Emulation for instruction protected by the User-Mode Instruction + * Prevention feature * * Copyright (c) 2017, Intel Corporation. * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> @@ -18,10 +18,10 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * - * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) - * from being executed with CPL > 0. Otherwise, a general protection fault is - * issued. + * User-Mode Instruction Prevention is a security feature present in recent + * x86 processors that, when enabled, prevents a group of instructions (SGDT, + * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general + * protection fault if the instruction is executed with CPL > 0. * * Rather than relaying to the user space the general protection fault caused by * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be @@ -36,8 +36,8 @@ * DOSEMU2) rely on this subset of instructions to function. * * The instructions protected by UMIP can be split in two groups. Those which - * return a kernel memory address (sgdt and sidt) and those which return a - * value (sldt, str and smsw). + * return a kernel memory address (SGDT and SIDT) and those which return a + * value (SLDT, STR and SMSW). * * For the instructions that return a kernel memory address, applications * such as WineHQ rely on the result being located in the kernel memory space, @@ -45,15 +45,13 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that sldt and str are not commonly used in programs that run on WineHQ + * Given that SLDT and STR are not commonly used in programs that run on WineHQ * or DOSEMU2, they are not emulated. * * The instruction smsw is emulated to return the value that the register CR0 * has at boot time as set in the head_32. * - * Also, emulation is provided only for 32-bit processes; 64-bit processes - * that attempt to use the instructions that UMIP protects will receive the - * SIGSEGV signal issued as a consequence of the general protection fault. + * Emulation is provided for both 32-bit and 64-bit processes. * * Care is taken to appropriately emulate the results when segmentation is * used. That is, rather than relying on USER_DS and USER_CS, the function @@ -63,17 +61,18 @@ * application uses a local descriptor table. */ -#define UMIP_DUMMY_GDT_BASE 0xfffe0000 -#define UMIP_DUMMY_IDT_BASE 0xffff0000 +#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL +#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL /* * The SGDT and SIDT instructions store the contents of the global descriptor * table and interrupt table registers, respectively. The destination is a * memory operand of X+2 bytes. X bytes are used to store the base address of - * the table and 2 bytes are used to store the limit. In 32-bit processes, the - * only processes for which emulation is provided, X has a value of 4. + * the table and 2 bytes are used to store the limit. In 32-bit processes X + * has a value of 4, in 64-bit processes X has a value of 8. */ -#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8 +#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4 #define UMIP_GDT_IDT_LIMIT_SIZE 2 #define UMIP_INST_SGDT 0 /* 0F 01 /0 */ @@ -92,7 +91,7 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warning(regs, fmt, ...) \ +#define umip_pr_warn(regs, fmt, ...) \ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) /** @@ -189,6 +188,7 @@ static int identify_insn(struct insn *insn) * @umip_inst: A constant indicating the instruction to emulate * @data: Buffer into which the dummy result is stored * @data_size: Size of the emulated result + * @x86_64: true if process is 64-bit, false otherwise * * Emulate an instruction protected by UMIP and provide a dummy result. The * result of the emulation is saved in @data. The size of the results depends @@ -202,11 +202,8 @@ static int identify_insn(struct insn *insn) * 0 on success, -EINVAL on error while emulating. */ static int emulate_umip_insn(struct insn *insn, int umip_inst, - unsigned char *data, int *data_size) + unsigned char *data, int *data_size, bool x86_64) { - unsigned long dummy_base_addr, dummy_value; - unsigned short dummy_limit = 0; - if (!data || !data_size || !insn) return -EINVAL; /* @@ -219,6 +216,9 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * is always returned irrespective of the operand size. */ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + u64 dummy_base_addr; + u16 dummy_limit = 0; + /* SGDT and SIDT do not use registers operands. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; @@ -228,13 +228,24 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, else dummy_base_addr = UMIP_DUMMY_IDT_BASE; - *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + /* + * 64-bit processes use the entire dummy base address. + * 32-bit processes use the lower 32 bits of the base address. + * dummy_base_addr is always 64 bits, but we memcpy the correct + * number of bytes from it to the destination. + */ + if (x86_64) + *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT; + else + *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT; + + memcpy(data + 2, &dummy_base_addr, *data_size); - memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); } else if (umip_inst == UMIP_INST_SMSW) { - dummy_value = CR0_STATE; + unsigned long dummy_value = CR0_STATE; /* * Even though the CR0 register has 4 bytes, the number @@ -290,11 +301,10 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) * fixup_umip_exception() - Fixup a general protection fault caused by UMIP * @regs: Registers as saved when entering the #GP handler * - * The instructions sgdt, sidt, str, smsw, sldt cause a general protection - * fault if executed with CPL > 0 (i.e., from user space). If the offending - * user-space process is not in long mode, this function fixes the exception - * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not - * fixed up. Also long mode user-space processes are not fixed up. + * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). This function fixes + * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR + * and SLDT are not fixed up. * * If operands are memory addresses, results are copied to user-space memory as * indicated by the instruction pointed by eIP using the registers indicated in @@ -370,16 +380,17 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate SLDT, STR or user long mode processes. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs)) + /* Do not emulate (spoof) SLDT or STR. */ + if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; - umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); - if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, + user_64bit_mode(regs))) return false; /* diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 332ae6530fa8..e9cc182aa97e 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -187,6 +187,8 @@ static struct orc_entry *orc_find(unsigned long ip) return orc_ftrace_find(ip); } +#ifdef CONFIG_MODULES + static void orc_sort_swap(void *_a, void *_b, int size) { struct orc_entry *orc_a, *orc_b; @@ -229,7 +231,6 @@ static int orc_sort_cmp(const void *_a, const void *_b) return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1; } -#ifdef CONFIG_MODULES void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, void *_orc, size_t orc_size) { @@ -273,9 +274,11 @@ void __init unwind_init(void) return; } - /* Sort the .orc_unwind and .orc_unwind_ip tables: */ - sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, - orc_sort_swap); + /* + * Note, the orc_unwind and orc_unwind_ip tables were already + * sorted at build time via the 'sorttable' tool. + * It's ready for binary search straight away, no need to sort it. + */ /* Initialize the fast lookup table: */ lookup_num_blocks = orc_lookup_end - orc_lookup; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d8359ebeea70..15e5aad8ac2c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -508,9 +508,12 @@ struct uprobe_xol_ops { void (*abort)(struct arch_uprobe *, struct pt_regs *); }; -static inline int sizeof_long(void) +static inline int sizeof_long(struct pt_regs *regs) { - return in_ia32_syscall() ? 4 : 8; + /* + * Check registers for mode as in_xxx_syscall() does not apply here. + */ + return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -521,9 +524,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { - unsigned long new_sp = regs->sp - sizeof_long(); + unsigned long new_sp = regs->sp - sizeof_long(regs); - if (copy_to_user((void __user *)new_sp, &val, sizeof_long())) + if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; @@ -556,7 +559,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { - regs->sp += sizeof_long(); /* Pop incorrect return address */ + regs->sp += sizeof_long(regs); /* Pop incorrect return address */ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } @@ -675,7 +678,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * "call" insn was executed out-of-line. Just restore ->sp and restart. * We could also restore ->ip and try to call branch_emulate_op() again. */ - regs->sp += sizeof_long(); + regs->sp += sizeof_long(regs); return -ERESTART; } @@ -839,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @auprobe: the probepoint information. * @mm: the probed address space. - * @arch_uprobe: the probepoint information. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ @@ -1056,7 +1059,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize = sizeof_long(), nleft; + int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index a024c4f7ba56..641f0fe1e5b4 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -31,7 +31,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -ENTRY(verify_cpu) +SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -137,4 +137,4 @@ ENTRY(verify_cpu) popf # Restore caller passed flags xorl %eax, %eax ret -ENDPROC(verify_cpu) +SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a76c12b38e92..91d55454e702 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -381,7 +381,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) mark_screen_rdonly(tsk->mm); memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); - force_iret(); return regs->ax; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e2feacf921a0..e3296aa028fe 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,9 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -141,17 +144,12 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif + } :text =0xcccc - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - - EXCEPTION_TABLE(16) :text = 0x9090 - - /* .text should occupy whole number of pages */ + /* End of text section, which should occupy whole number of pages */ + _etext = .; . = ALIGN(PAGE_SIZE); + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) X86_ALIGN_RODATA_END @@ -195,12 +193,10 @@ SECTIONS __vvar_beginning_hack = .; /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ +#define EMIT_VVAR(name, offset) \ . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS #include <asm/vvar.h> -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1bef687faf22..85f1a90c55cd 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -20,7 +20,7 @@ #include <asm/irq.h> #include <asm/io_apic.h> #include <asm/hpet.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> @@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } +static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static __init void get_rtc_noop(struct timespec64 *now) { } + +static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, + {} +}; + +/* + * Allow devicetree configured systems to disable the RTC by setting the + * corresponding DT node's status property to disabled. Code is optimized + * out for CONFIG_OF=n builds. + */ +static __init void x86_wallclock_init(void) +{ + struct device_node *node = of_find_matching_node(NULL, of_cmos_match); + + if (node && !of_device_is_available(node)) { + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + } +} /* * The platform setup functions are preset with the default functions @@ -58,6 +80,7 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .trap_init = x86_init_noop, + .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, @@ -73,7 +96,7 @@ struct x86_init_ops x86_init __initdata = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, - .wallclock_init = x86_init_noop, + .wallclock_init = x86_wallclock_init, }, .iommu = { @@ -95,6 +118,7 @@ struct x86_init_ops x86_init __initdata = { }, .acpi = { + .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, |