diff options
Diffstat (limited to 'arch/x86')
79 files changed, 899 insertions, 769 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..7b6fd68b4715 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1070,7 +1070,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE + depends on X86_MCE && X86_LOCAL_APIC tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff01c8fc76f7..801c7a158e55 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -32,160 +32,13 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ + c->runtime_services = table->runtime; \ c->boot_services = table->boottime; \ c->text_output = table->con_out; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); -void efi_char16_printk(efi_system_table_t *, efi_char16_t *); - -static efi_status_t -__file_size32(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_32_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} - -static efi_status_t -__file_size64(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_64_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u64 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} -efi_status_t -efi_file_size(efi_system_table_t *sys_table, void *__fh, - efi_char16_t *filename_16, void **handle, u64 *file_sz) -{ - if (efi_early->is64) - return __file_size64(__fh, filename_16, handle, file_sz); - - return __file_size32(__fh, filename_16, handle, file_sz); -} - -efi_status_t -efi_file_read(void *handle, unsigned long *size, void *addr) -{ - unsigned long func; - - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } else { - efi_file_handle_32_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } -} - -efi_status_t efi_file_close(void *handle) -{ - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } else { - efi_file_handle_32_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } -} - static inline efi_status_t __open_volume32(void *__image, void **__fh) { efi_file_io_interface_t *io; @@ -249,30 +102,8 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { - unsigned long output_string; - size_t offset; - - if (efi_early->is64) { - struct efi_simple_text_output_protocol_64 *out; - u64 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u64 *)output_string; - - efi_early->call(*func, out, str); - } else { - struct efi_simple_text_output_protocol_32 *out; - u32 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u32 *)output_string; - - efi_early->call(*func, out, str); - } + efi_call_proto(efi_simple_text_output_protocol, output_string, + efi_early->text_output, str); } static efi_status_t @@ -1157,6 +988,13 @@ struct boot_params *efi_main(struct efi_config *c, else setup_boot_services32(efi_early); + /* + * If the boot loader gave us a value for secure_boot then we use that, + * otherwise we ask the BIOS. + */ + if (boot_params->secure_boot == efi_secureboot_mode_unset) + boot_params->secure_boot = efi_get_secureboot(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index fd0b6a272dd5..d85b9625e836 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 4,8,0 + .fill 5,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4d85e600db78..d2ae1f821e0c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -264,7 +264,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+32(%rip) + addq %rbp, efi64_config+40(%rip) movq %rax, %rdi call make_boot_params @@ -284,7 +284,7 @@ handover_entry: * Relocate efi_config->call(). */ movq efi_config(%rip), %rax - addq %rbp, 32(%rax) + addq %rbp, 40(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -456,14 +456,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd583dce1..9e240fcba784 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 6ef688a1ef3e..7ff1b0c86a8e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1085,9 +1085,9 @@ static void aesni_free_simds(void) aesni_simd_skciphers[i]; i++) simd_skcipher_free(aesni_simd_skciphers[i]); - for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2) && - aesni_simd_skciphers2[i].simd; i++) - simd_skcipher_free(aesni_simd_skciphers2[i].simd); + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) + if (aesni_simd_skciphers2[i].simd) + simd_skcipher_free(aesni_simd_skciphers2[i].simd); } static int __init aesni_init(void) @@ -1168,7 +1168,7 @@ static int __init aesni_init(void) simd = simd_skcipher_create_compat(algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) - goto unregister_simds; + continue; aesni_simd_skciphers2[i].simd = simd; } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 701d29f8e4d3..57f7ec35216e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -255,23 +255,6 @@ ENTRY(__switch_to_asm) END(__switch_to_asm) /* - * The unwinder expects the last frame on the stack to always be at the same - * offset from the end of the page, which allows it to validate the stack. - * Calling schedule_tail() directly would break that convention because its an - * asmlinkage function so its argument has to be pushed on the stack. This - * wrapper creates a proper "end of stack" frame header before the call. - */ -ENTRY(schedule_tail_wrapper) - FRAME_BEGIN - - pushl %eax - call schedule_tail - popl %eax - - FRAME_END - ret -ENDPROC(schedule_tail_wrapper) -/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -279,15 +262,24 @@ ENDPROC(schedule_tail_wrapper) * edi: kernel thread arg */ ENTRY(ret_from_fork) - call schedule_tail_wrapper + FRAME_BEGIN /* help unwinder find end of stack */ + + /* + * schedule_tail() is asmlinkage so we have to put its 'prev' argument + * on the stack. + */ + pushl %eax + call schedule_tail + popl %eax testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax + leal FRAME_OFFSET(%esp), %eax call syscall_return_slowpath + FRAME_END jmp restore_all /* kernel thread */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5b219707c2f2..044d18ebc43c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -408,17 +409,19 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - movq %rsp, %rdi + leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS + FRAME_END jmp restore_regs_and_iret 1: diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 1d392c39fe56..b8ccdb5c9244 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,11 +1,4 @@ -obj-y += core.o - -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o -endif - -obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-y += core.o +obj-y += amd/ +obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile new file mode 100644 index 000000000000..b1da46f396e0 --- /dev/null +++ b/arch/x86/events/amd/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o +obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += iommu.o +endif + diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 05612a2529c8..496e60391fac 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1010,7 +1010,7 @@ static __init int amd_ibs_init(void) * all online cpus. */ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - "perf/x86/amd/ibs:STARTING", + "perf/x86/amd/ibs:starting", x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index a0b1bdb3ad42..4d1f7f2d9aff 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -22,13 +22,17 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB +#define NUM_COUNTERS_L3 6 +#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 +#define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 +static int num_counters_llc; +static int num_counters_nb; + static HLIST_HEAD(uncore_unused_list); struct amd_uncore { @@ -45,30 +49,30 @@ struct amd_uncore { }; static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; +static struct amd_uncore * __percpu *amd_uncore_llc; static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; +static struct pmu amd_llc_pmu; static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; +static cpumask_t amd_llc_active_mask; static bool is_nb_event(struct perf_event *event) { return event->pmu->type == amd_nb_pmu.type; } -static bool is_l2_event(struct perf_event *event) +static bool is_llc_event(struct perf_event *event) { - return event->pmu->type == amd_l2_pmu.type; + return event->pmu->type == amd_llc_pmu.type; } static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) { if (is_nb_event(event) && amd_uncore_nb) return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); + else if (is_llc_event(event) && amd_uncore_llc) + return *per_cpu_ptr(amd_uncore_llc, event->cpu); return NULL; } @@ -183,16 +187,16 @@ static int amd_uncore_event_init(struct perf_event *event) return -ENOENT; /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. Interrupts can be directed + * to a single target core, however, event counts generated by processes + * running on other cores cannot be masked out. So we do not support + * sampling and per-thread events. */ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* NB and L2 counters do not have usr/os/guest/host bits */ + /* NB and Last level cache counters do not have usr/os/guest/host bits */ if (event->attr.exclude_user || event->attr.exclude_kernel || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; @@ -226,8 +230,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, if (pmu->type == amd_nb_pmu.type) active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; + else if (pmu->type == amd_llc_pmu.type) + active_mask = &amd_llc_active_mask; else return 0; @@ -244,30 +248,47 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, +/* + * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based + * on family + */ +#define AMD_FORMAT_ATTR(_dev, _name, _format) \ +static ssize_t \ +_dev##_show##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); + +/* Used for each uncore counter type */ +#define AMD_ATTRIBUTE(_name) \ +static struct attribute *amd_uncore_format_attr_##_name[] = { \ + &format_attr_event_##_name.attr, \ + &format_attr_umask.attr, \ + NULL, \ +}; \ +static struct attribute_group amd_uncore_format_group_##_name = { \ + .name = "format", \ + .attrs = amd_uncore_format_attr_##_name, \ +}; \ +static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ + &amd_uncore_attr_group, \ + &amd_uncore_format_group_##_name, \ + NULL, \ }; -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; +AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); +AMD_FORMAT_ATTR(umask, , "config:8-15"); +AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); +AMD_FORMAT_ATTR(event, _l3, "config:0-7"); +AMD_ATTRIBUTE(df); +AMD_ATTRIBUTE(l3); static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -276,10 +297,8 @@ static struct pmu amd_nb_pmu = { .read = amd_uncore_read, }; -static struct pmu amd_l2_pmu = { +static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -296,14 +315,14 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_l2; + struct amd_uncore *uncore_nb = NULL, *uncore_llc; if (amd_uncore_nb) { uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->num_counters = num_counters_nb; uncore_nb->rdpmc_base = RDPMC_BASE_NB; uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; @@ -312,18 +331,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) + if (amd_uncore_llc) { + uncore_llc = amd_uncore_alloc(cpu); + if (!uncore_llc) goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - uncore_l2->id = -1; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + uncore_llc->cpu = cpu; + uncore_llc->num_counters = num_counters_llc; + uncore_llc->rdpmc_base = RDPMC_BASE_LLC; + uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_llc->active_mask = &amd_llc_active_mask; + uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->id = -1; + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } return 0; @@ -376,17 +395,17 @@ static int amd_uncore_cpu_starting(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; } - if (amd_uncore_l2) { + if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; unsigned int nshared; - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + uncore = *per_cpu_ptr(amd_uncore_llc, cpu); cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); nshared = ((eax >> 14) & 0xfff) + 1; uncore->id = apicid - (apicid % nshared); - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; } return 0; @@ -419,8 +438,8 @@ static int amd_uncore_cpu_online(unsigned int cpu) if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_online(cpu, amd_uncore_llc); return 0; } @@ -456,8 +475,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu) if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_down_prepare(cpu, amd_uncore_llc); return 0; } @@ -479,8 +498,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_dead(cpu, amd_uncore_llc); return 0; } @@ -492,6 +511,47 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; + switch(boot_cpu_data.x86) { + case 23: + /* Family 17h: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + /* + * For Family17h, the NorthBridge counters are + * re-purposed as Data Fabric counters. Also, support is + * added for L3 counters. The pmus are exported based on + * family as either L2 or L3 and NB or DF. + */ + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; + break; + case 22: + /* Family 16h - may change: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + default: + /* + * All prior families have the same number of + * NorthBridge and Last Level Cache counters + */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + } + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; @@ -510,16 +570,16 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { + amd_uncore_llc = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_llc) { ret = -ENOMEM; - goto fail_l2; + goto fail_llc; } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); if (ret) - goto fail_l2; + goto fail_llc; - pr_info("perf: AMD L2I counters detected\n"); + pr_info("perf: AMD LLC counters detected\n"); ret = 0; } @@ -529,7 +589,7 @@ static int __init amd_uncore_init(void) if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, "perf/x86/amd/uncore:prepare", amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) - goto fail_l2; + goto fail_llc; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, "perf/x86/amd/uncore:starting", @@ -546,11 +606,11 @@ fail_start: cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); fail_prep: cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); -fail_l2: +fail_llc: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); + if (amd_uncore_llc) + free_percpu(amd_uncore_llc); fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 019c5887b698..1635c0c8df23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 86138267b68a..eb1484c86bb4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3176,13 +3176,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct cpu_hw_events *sibling; struct intel_excl_cntrs *c; - c = per_cpu(cpu_hw_events, i).excl_cntrs; + sibling = &per_cpu(cpu_hw_events, i); + c = sibling->excl_cntrs; if (c && c->core_id == core_id) { cpuc->kfree_on_online[1] = cpuc->excl_cntrs; cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; + if (!sibling->excl_thread_id) + cpuc->excl_thread_id = 1; break; } } @@ -3987,7 +3990,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fec8a461bdef..aff4b5b69d40 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { @@ -539,6 +541,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), { }, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index be202390bbd3..9dfeeeca0ea8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; /* log dropped samples number */ - if (error[bit]) + if (error[bit]) { perf_log_lost_samples(event, error[bit]); + if (perf_event_account_interrupt(event)) + x86_pmu_stop(event, 0); + } + if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1c1b9fe705c8..5900471ee508 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -99,18 +99,24 @@ static struct attribute_group pt_cap_group = { }; PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(pwr_evt, "config:4" ); +PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { &format_attr_cyc.attr, + &format_attr_pwr_evt.attr, + &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_ptw.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index bd34124449b0..22054ca49026 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,20 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } + /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +620,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -697,6 +698,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } @@ -769,6 +771,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), {}, }; @@ -802,29 +807,21 @@ static int __init rapl_pmu_init(void) /* * Install callbacks. Core will call them for each online cpu. */ - - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "perf/x86/rapl:prepare", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, "perf/x86/rapl:online", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -835,7 +832,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 97c246f84dea..758c1aa5009d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -100,7 +100,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - return pmu->boxes[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -733,6 +739,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; @@ -763,30 +770,6 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) -{ - struct intel_uncore_pmu *pmu = type->pmus; - struct intel_uncore_box *box; - int i, pkg; - - if (pmu) { - pkg = topology_physical_package_id(cpu); - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box) - uncore_box_exit(box); - } - } -} - -static void uncore_exit_boxes(void *dummy) -{ - struct intel_uncore_type **types; - - for (types = uncore_msr_uncores; *types; types++) - __uncore_exit_boxes(*types++, smp_processor_id()); -} - static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1057,86 +1040,6 @@ static void uncore_pci_exit(void) } } -static int uncore_cpu_dying(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } - return 0; -} - -static int first_init; - -static int uncore_cpu_starting(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg, ncpus = 1; - - if (first_init) { - /* - * On init we get the number of online cpus in the package - * and set refcount for all of them. - */ - ncpus = cpumask_weight(topology_core_cpumask(cpu)); - } - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (!box) - continue; - /* The first cpu on a package activates the box */ - if (atomic_add_return(ncpus, &box->refcnt) == ncpus) - uncore_box_init(box); - } - } - - return 0; -} - -static int uncore_cpu_prepare(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) - continue; - /* First cpu of a package allocates the box */ - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - box->pmu = pmu; - box->pkgid = pkg; - pmu->boxes[pkg] = box; - } - } - return 0; -} - static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, int new_cpu) { @@ -1176,12 +1079,14 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static int uncore_event_cpu_offline(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, pkg, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return 0; - + goto unref; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1193,12 +1098,82 @@ static int uncore_event_cpu_offline(unsigned int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + +unref: + /* Clear the references */ + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } return 0; } +static int allocate_boxes(struct intel_uncore_type **types, + unsigned int pkg, unsigned int cpu) +{ + struct intel_uncore_box *box, *tmp; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + LIST_HEAD(allocated); + int i; + + /* Try to allocate all required boxes */ + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + goto cleanup; + box->pmu = pmu; + box->pkgid = pkg; + list_add(&box->active_list, &allocated); + } + } + /* Install them in the pmus */ + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + box->pmu->boxes[pkg] = box; + } + return 0; + +cleanup: + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + kfree(box); + } + return -ENOMEM; +} + static int uncore_event_cpu_online(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, ret, pkg, target; + + pkg = topology_logical_package_id(cpu); + ret = allocate_boxes(types, pkg, cpu); + if (ret) + return ret; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box && atomic_inc_return(&box->refcnt) == 1) + uncore_box_init(box); + } + } /* * Check if there is an online cpu in the package @@ -1353,6 +1328,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), {}, }; @@ -1388,38 +1365,16 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - /* - * Install callbacks. Core will call them for each online cpu. - * - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required and - * uncore_cpu_prepare() is not called for each online cpu. - */ - if (!cret) { - ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - if (ret) - goto err; - } else { - cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - } - first_init = 1; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, - "perf/x86/uncore:starting", - uncore_cpu_starting, uncore_cpu_dying); - first_init = 0; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, - "perf/x86/uncore:online", - uncore_event_cpu_online, uncore_event_cpu_offline); + /* Install hotplug callbacks to setup the targets for each package */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "perf/x86/intel/uncore:online", + uncore_event_cpu_online, + uncore_event_cpu_offline); + if (ret) + goto err; return 0; err: - /* Undo box->init_box() */ - on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); return ret; @@ -1428,9 +1383,7 @@ module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e6832be714bc..dae2fedc1601 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2b892e2313a9..5d6a53fd7521 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -7,7 +7,6 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += clkdev.h -generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index ced283ac79df..af95c47d5c9e 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) } #define div_u64_rem div_u64_rem +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + u32 high, low; + + asm ("mull %[b]" : "=a" (low), "=d" (high) + : [a] "a" (a), [b] "rm" (b) ); + + return low | ((u64)high) << 32; +} +#define mul_u32_u32 mul_u32_u32 + #else # include <asm-generic/div64.h> #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index e99675b9c861..2f77bcefe6b4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -191,6 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; + u64 runtime_services; u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); @@ -226,6 +227,10 @@ static inline bool efi_is_64bit(void) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); +#define efi_call_runtime(f, ...) \ + __efi_early()->call(efi_table_attr(efi_runtime_services, f, \ + __efi_early()->runtime_services), __VA_ARGS__) + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 34a46dc076d3..8167fdb67ae8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -57,7 +57,7 @@ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5132f2a6c0a2..e63873683d4a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -97,10 +97,6 @@ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) - #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" @@ -193,6 +189,15 @@ extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; + +enum mce_notifier_prios { + MCE_PRIO_SRAO = INT_MAX, + MCE_PRIO_EXTLOG = INT_MAX - 1, + MCE_PRIO_NFIT = INT_MAX - 2, + MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_LOWEST = 0, +}; + extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -306,8 +311,6 @@ extern void (*deferred_error_int_vector)(void); void intel_init_thermal(struct cpuinfo_x86 *c); -void mce_log_therm_throt_event(__u64 status); - /* Interrupt Handler for core thermal thresholds */ extern int (*platform_thermal_notify)(__u64 msr_val); @@ -362,12 +365,13 @@ struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ + u8 count; /* Number of instances. */ }; struct smca_bank { struct smca_hwid *hwid; - /* Instance ID */ - u32 id; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ }; extern struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 38711df3bcb5..2266f864b747 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -140,6 +140,7 @@ extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); +extern bool initrd_gone; #else static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 195becc6f780..e793fc9a9b20 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,6 +52,21 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) +static inline u32 intel_get_microcode_revision(void) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_IA32_UCODE_REV, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); + + return rev; +} + #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eaf100508c36..e6cfe7ba2d65 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,7 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -219,6 +220,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, : "memory"); } +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a3269c897ec5..2e41c50ddf47 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -58,7 +58,7 @@ get_frame_pointer(struct task_struct *task, struct pt_regs *regs) if (task == current) return __builtin_frame_address(0); - return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; + return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5cb436acd463..fcc5cd387fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -36,7 +36,10 @@ static inline void prepare_switch_to(struct task_struct *prev, asmlinkage void ret_from_fork(void); -/* data that is pointed to by thread.sp */ +/* + * This is the structure pointed to by thread.sp for an inactive task. The + * order of the fields must match the code in __switch_to_asm(). + */ struct inactive_task_frame { #ifdef CONFIG_X86_64 unsigned long r15; @@ -48,6 +51,11 @@ struct inactive_task_frame { unsigned long di; #endif unsigned long bx; + + /* + * These two fields must be together. They form a stack frame header, + * needed by get_frame_pointer(). + */ unsigned long bp; unsigned long ret_addr; }; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index b10bf319ed20..5138dacf8bb8 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -135,7 +135,8 @@ struct boot_params { __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ __u8 kbd_status; /* 0x1eb */ - __u8 _pad5[3]; /* 0x1ec */ + __u8 secure_boot; /* 0x1ec */ + __u8 _pad5[2]; /* 0x1ed */ /* * The sentinel is set to a nonzero value (0xff) in header.S. * diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 64422f850e95..7ff007ed899d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/efi-bgrt.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> @@ -1557,6 +1558,12 @@ int __init early_acpi_boot_init(void) return 0; } +static int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + efi_bgrt_init(table); + return 0; +} + int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1581,6 +1588,8 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 945e512a112a..bd6b8c270c24 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2115,6 +2115,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) @@ -2136,6 +2137,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 45d44c173cf9..4a7080c84a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -905,8 +905,8 @@ static int apm_cpu_idle(struct cpuidle_device *dev, { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ - static unsigned int last_stime; /* = 0 */ - cputime_t stime, utime; + static u64 last_stime; /* = 0 */ + u64 stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; @@ -919,7 +919,7 @@ recalc: } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = cputime_to_jiffies(stime - last_stime); + idle_percentage = nsecs_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index c62e015b126c..de827d6ac8c2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -81,6 +81,7 @@ void common(void) { BLANK(); OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71cae73a5076..4e95b2e0d95f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -312,12 +312,19 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 7; - /* get compute unit information */ - smp_num_siblings = ((ebx >> 8) & 3) + 1; - c->x86_max_cores /= smp_num_siblings; - c->cpu_core_id = ebx & 0xff; + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -548,8 +555,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,5 @@ -#include <linux/bitops.h> -#include <linux/kernel.h> + +#include <linux/sched.h> #include <asm/cpufeature.h> #include <asm/e820.h> @@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc1697ca5191..3bcf6d880611 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -1015,6 +1016,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1055,6 +1057,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1221,7 +1225,7 @@ static __init int setup_disablecpuid(char *arg) { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) + if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) setup_clear_cpu_cap(bit); else return 0; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,7 @@ #include <asm/pci-direct.h> #include <asm/tsc.h> #include <asm/cpufeature.h> +#include <linux/sched.h> #include "cpu.h" @@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..026c728d6ba7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -14,6 +14,7 @@ #include <asm/bugs.h> #include <asm/cpu.h> #include <asm/intel-family.h> +#include <asm/microcode_intel.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -78,14 +79,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { - unsigned lower_word; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* Required by the SDM */ - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); - } + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); /* * Atom erratum AAE44/AAF40/AAG38/AAH41: @@ -124,8 +119,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83f1a98d37db..2eee85379689 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; - if (severity >= GHES_SEV_PANIC) + + if (severity >= GHES_SEV_PANIC) { m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } m.addr = mem_err->physical_addr; mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..1e5a50c11d3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void) return new_head.first; } -void mce_gen_pool_process(void) +void mce_gen_pool_process(struct work_struct *__unused) { struct llist_node *head; struct mce_evt_llist *node, *tmp; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 517619ea6498..99165b206df3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -152,7 +152,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -192,9 +191,7 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..903043e6a62b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -31,7 +31,7 @@ struct mce_evt_llist { struct mce mce; }; -void mce_gen_pool_process(void); +void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 00ef43233e03..8e9725c607ea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -128,7 +128,6 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -217,9 +216,7 @@ void mce_register_decode_chain(struct notifier_block *nb) { atomic_inc(&num_notifiers); - /* Ensure SRAO notifier has the highest priority in the decode chain. */ - if (nb != &mce_srao_nb && nb->priority == INT_MAX) - nb->priority -= 1; + WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); } @@ -583,7 +580,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, } static struct notifier_block mce_srao_nb = { .notifier_call = srao_decode_notifier, - .priority = INT_MAX, + .priority = MCE_PRIO_SRAO, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -609,7 +606,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, static struct notifier_block mce_default_nb = { .notifier_call = mce_default_notifier, /* lowest prio, we want it to run last. */ - .priority = 0, + .priority = MCE_PRIO_LOWEST, }; /* @@ -710,14 +707,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); - /* - * m.tsc was set in mce_setup(). Clear it if not requested. - * - * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid - * that dance. - */ - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) @@ -1156,6 +1147,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) goto out; mce_gather_info(&m, regs); + m.tsc = rdtsc(); final = this_cpu_ptr(&mces_seen); *final = m; @@ -1322,41 +1314,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Action optional processing happens here (picking up - * from the list of faulting pages that do_machine_check() - * placed into the genpool). - */ -static void mce_process_work(struct work_struct *dummy) -{ - mce_gen_pool_process(); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). @@ -1373,20 +1330,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void __restart_timer(struct timer_list *t, unsigned long interval) +static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; unsigned long flags; local_irq_save(flags); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); local_irq_restore(flags); } @@ -1421,7 +1373,7 @@ static void mce_timer_fn(unsigned long data) done: __this_cpu_write(mce_next_interval, iv); - __restart_timer(t, iv); + __start_timer(t, iv); } /* @@ -1432,7 +1384,7 @@ void mce_timer_kick(unsigned long interval) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - __restart_timer(t, interval); + __start_timer(t, interval); if (interval < iv) __this_cpu_write(mce_next_interval, interval); @@ -1779,17 +1731,15 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; - per_cpu(mce_next_interval, cpu) = iv; - - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, cpu); + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); } static void __mcheck_cpu_setup_timer(void) @@ -1806,7 +1756,7 @@ static void __mcheck_cpu_init_timer(void) unsigned int cpu = smp_processor_id(); setup_pinned_timer(t, mce_timer_fn, cpu); - mce_start_timer(cpu, t); + mce_start_timer(t); } /* Handle unconfigured int18 (should never happen) */ @@ -2196,7 +2146,7 @@ int __init mcheck_init(void) mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); - INIT_WORK(&mce_work, mce_process_work); + INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); return 0; @@ -2566,7 +2516,7 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); int ret; mce_device_create(cpu); @@ -2577,13 +2527,13 @@ static int mce_cpu_online(unsigned int cpu) return ret; } mce_reenable_cpu(); - mce_start_timer(cpu, t); + mce_start_timer(t); return 0; } static int mce_cpu_pre_down(unsigned int cpu) { - struct timer_list *t = &per_cpu(mce_timer, cpu); + struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); del_timer_sync(t); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a5fd137417a2..9e5427df3243 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -192,6 +192,7 @@ static void get_smca_bank_info(unsigned int bank) smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = instance_id; + smca_banks[bank].sysfs_id = s_hwid->count++; break; } } @@ -777,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) mce_setup(&m); m.status = status; - m.bank = bank; + m.bank = bank; + m.tsc = rdtsc(); if (threshold_err) m.misc = misc; @@ -1064,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "%s_%x", smca_get_name(bank_type), - smca_banks[bank].id); + smca_banks[bank].sysfs_id); return buf_mcatype; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 465aca8be009..85469f84c921 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -6,7 +6,7 @@ * * Maintains a counter in /sys that keeps track of the number of thermal * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog and mcelog is rate limited). + * (since the logging to syslog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * @@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = { * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. - * - * Returns: 0 : Event should NOT be further logged, i.e. still in - * "timeout" from previous log message. - * 1 : Event should be logged further, and a message has been - * printed to the syslog. */ -static int therm_throt_process(bool new_event, int event, int level) +static void therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); @@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level) else if (event == POWER_LIMIT_EVENT) state = &pstate->core_power_limit; else - return 0; + return; } else if (level == PACKAGE_LEVEL) { if (event == THERMAL_THROTTLING_EVENT) state = &pstate->package_throttle; else if (event == POWER_LIMIT_EVENT) state = &pstate->package_power_limit; else - return 0; + return; } else - return 0; + return; old_event = state->new_event; state->new_event = new_event; @@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level) if (time_before64(now, state->next_check) && state->count != state->last_count) - return 0; + return; state->next_check = now + CHECK_INTERVAL; state->last_count = state->count; @@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - return 1; + return; } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - return 1; + return; } - - return 0; } static int thresh_event_valid(int level, int event) @@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void) /* Check for violation of core thermal thresholds*/ notify_thresholds(msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 6a31e2691f3a..079e81733a58 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -384,8 +384,9 @@ void load_ucode_amd_ap(unsigned int family) reget: if (!get_builtin_microcode(&cp, family)) { #ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); + if (!initrd_gone) + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif if (!(cp.data && cp.size)) { /* diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2af69d27da62..73102d932760 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -46,6 +46,8 @@ static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr = true; +bool initrd_gone; + LIST_HEAD(microcode_cache); /* @@ -190,21 +192,24 @@ void load_ucode_ap(void) static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - return save_microcode_in_initrd_intel(); + ret = save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(c->x86); + ret = save_microcode_in_initrd_amd(c->x86); break; default: break; } - return -EINVAL; + initrd_gone = true; + + return ret; } struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) @@ -247,9 +252,16 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) * has the virtual address of the beginning of the initrd. It also * possibly relocates the ramdisk. In either case, initrd_start contains * the updated address so use that instead. + * + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. */ - if (!use_pa && initrd_start) - start = initrd_start; + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } return find_cpio_data(path, (void *)start, size, NULL); #else /* !CONFIG_BLK_DEV_INITRD */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b624b54912e1..8325d8a09ab0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Current microcode patch used in early patching */ +/* Current microcode patch used in early patching on the APs. */ struct microcode_intel *intel_ucode_patch; static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, @@ -150,7 +150,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { struct ucode_patch *p; - p = kzalloc(size, GFP_KERNEL); + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -368,26 +368,6 @@ next: return patch; } -static void cpuid_1(void) -{ - /* - * According to the Intel SDM, Volume 3, 9.11.7: - * - * CPUID returns a value in a model specific register in - * addition to its usual register return values. The - * semantics of CPUID cause it to deposit an update ID value - * in the 64-bit model-specific register at address 08BH - * (IA32_BIOS_SIGN_ID). If no update is present in the - * processor, the value in the MSR remains unmodified. - * - * Use native_cpuid -- this code runs very early and we don't - * want to mess with paravirt. - */ - unsigned int eax = 1, ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); -} - static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; @@ -410,15 +390,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - csig.rev = val[1]; + csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; uci->valid = 1; @@ -602,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - unsigned int val[2]; + u32 rev; mc = uci->mc; if (!mc) @@ -610,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc->hdr.rev) + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 /* Flush global tlb. This is precaution. */ flush_tlb_early(); #endif - uci->cpu_sig.rev = val[1]; + uci->cpu_sig.rev = rev; if (early) print_ucode(uci); @@ -639,12 +607,6 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; - /* - * AP loading didn't find any microcode patch, no need to save anything. - */ - if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) - return 0; - if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); @@ -660,7 +622,6 @@ int __init save_microcode_in_initrd_intel(void) return 0; } - /* * @res_patch, output: a pointer to the patch we found. */ @@ -804,8 +765,8 @@ static int apply_microcode_intel(int cpu) struct microcode_intel *mc; struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; - unsigned int val[2]; static int prev_rev; + u32 rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -822,33 +783,28 @@ static int apply_microcode_intel(int cpu) /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + rev = intel_get_microcode_revision(); - if (val[1] != mc->hdr.rev) { + if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); return -1; } - if (val[1] != prev_rev) { + if (rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", - val[1], + rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); - prev_rev = val[1]; + prev_rev = rev; } c = &cpu_data(cpu); - uci->cpu_sig.rev = val[1]; - c->microcode = val[1]; + uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } @@ -860,7 +816,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - unsigned int curr_mc_size = 0; + unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; while (leftover) { @@ -901,6 +857,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; + new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ } @@ -926,7 +883,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. */ - save_mc_for_early(new_mc, curr_mc_size); + save_mc_for_early(new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/mm.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e4e97a5355ce..de7234401275 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,6 +9,7 @@ #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> +#include <asm/fpu/xstate.h> #include <asm/traps.h> #include <linux/hardirq.h> @@ -183,7 +184,8 @@ void fpstate_init(union fpregs_state *state) * it will #GP. Make sure it is replaced after the memset(). */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 85e87b46c318..dc6ba5bda9fc 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -352,6 +352,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -132,10 +132,8 @@ int sched_set_itmt_support(void) sysctl_sched_itmt_enabled = 1; - if (sysctl_sched_itmt_enabled) { - x86_topology_update = true; - rebuild_sched_domains(); - } + x86_topology_update = true; + rebuild_sched_domains(); mutex_unlock(&itmt_update_mutex); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb3509338ae0..520b8dfe1640 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -745,7 +745,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { + hlist_for_each_entry(ri, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4cfba947d774..69780edf0dde 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1176,6 +1176,20 @@ void __init setup_arch(char **cmdline_p) /* Allocate bigger log buffer */ setup_log_buf(1); + if (efi_enabled(EFI_BOOT)) { + switch (boot_params.secure_boot) { + case efi_secureboot_mode_disabled: + pr_info("Secure boot disabled\n"); + break; + case efi_secureboot_mode_enabled: + pr_info("Secure boot enabled\n"); + break; + default: + pr_info("Secure boot could not be determined\n"); + break; + } + } + reserve_initrd(); acpi_table_upgrade(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 46732dc3b73c..99b920d0e516 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -433,9 +433,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index be3a49ee0356..2724dc82f992 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -694,6 +694,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -1106,6 +1107,16 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1118,6 +1129,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) @@ -1355,6 +1367,9 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same @@ -1385,8 +1400,6 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - else - tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index d0db011051a5..728f75378475 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -286,13 +286,6 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR @@ -380,14 +373,19 @@ void check_tsc_sync_target(void) int cpus = 2; /* Also aborts if there is no TSC. */ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) return; /* * Store, verify and sanitize the TSC adjust register. If * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false)) { + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { atomic_inc(&skip_test); return; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 4443e499f279..23d15565d02a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -6,6 +6,21 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + static void unwind_dump(struct unwind_state *state, unsigned long *sp) { static bool dumped_before = false; @@ -48,7 +63,8 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (state->regs && user_mode(state->regs)) return 0; - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return __kernel_text_address(addr) ? addr : 0; @@ -162,7 +178,7 @@ bool unwind_next_frame(struct unwind_state *state) if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)*state->bp; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); /* is the next frame pointer an encoded pointer to pt_regs? */ regs = decode_frame_pointer(next_bp); @@ -207,6 +223,16 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: + /* + * When unwinding a non-current task, the task might actually be + * running on another CPU, in which case it could be modifying its + * stack while we're reading it. This is generally not a problem and + * can be ignored as long as the caller understands that unwinding + * another task will not always succeed. + */ + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index ec5d7545e6dc..0442d98367ae 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -160,11 +160,12 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) static void mark_screen_rdonly(struct mm_struct *mm) { + struct vm_area_struct *vma; + spinlock_t *ptl; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; int i; down_write(&mm->mmap_sem); @@ -177,7 +178,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pmd = pmd_offset(pud, 0xA0000); if (pmd_trans_huge(*pmd)) { - struct vm_area_struct *vma = find_vma(mm, 0xA0000); + vma = find_vma(mm, 0xA0000); split_huge_pmd(vma, pmd, 0xA0000); } if (pmd_none_or_clear_bad(pmd)) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56628a44668b..cedbba0f3402 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -818,6 +818,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } +static int segmented_write_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned int size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); +} + /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. @@ -1571,7 +1585,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, &ctxt->exception); } -/* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1608,20 +1621,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; - /* NULL selector is not valid for TR, CS and SS (except for long mode) */ - if ((seg == VCPU_SREG_CS - || (seg == VCPU_SREG_SS - && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) - || seg == VCPU_SREG_TR) - && null_selector) - goto exception; - /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; - if (null_selector) /* for NULL selector skip all following checks */ + /* NULL selector is not valid for TR, CS and (except for long mode) SS */ + if (null_selector) { + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) + goto exception; + + if (seg == VCPU_SREG_SS) { + if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) + goto exception; + + /* + * ctxt->ops->set_segment expects the CPL to be in + * SS.DPL, so fake an expand-up 32-bit data segment. + */ + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = cpl; + seg_desc.d = 1; + seg_desc.g = 1; + } + + /* Skip all following checks */ goto load; + } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) @@ -1737,6 +1764,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); + + /* + * None of MOV, POP and LSS can load a NULL selector in CPL=3, but + * they can load it at CPL<3 (Intel's manual says only LSS can, + * but it's wrong). + * + * However, the Intel manual says that putting IST=1/DPL=3 in + * an interrupt gate will result in SS=3 (the AMD manual instead + * says it doesn't), so allow SS=3 in __load_segment_descriptor + * and only forbid it here. + */ + if (seg == VCPU_SREG_SS && selector == 3 && + ctxt->mode == X86EMUL_MODE_PROT64) + return emulate_exception(ctxt, GP_VECTOR, 0, true); + return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } @@ -3685,8 +3727,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, } /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, ctxt->dst.addr.mem, - &desc_ptr, 2 + ctxt->op_bytes); + return segmented_write_std(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); } static int em_sgdt(struct x86_emulate_ctxt *ctxt) @@ -3932,7 +3974,7 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) else size = offsetof(struct fxregs_state, xmm_space[0]); - return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); } static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, @@ -3974,7 +4016,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); if (rc != X86EMUL_CONTINUE) return rc; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..2ecd7dab4631 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -964,10 +964,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { - cputime_t utime, stime; + u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); + + return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fe290c1b7d8..2f6ef5121a4c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2426,3 +2426,9 @@ void kvm_lapic_init(void) jump_label_rate_limit(&apic_hw_disabled, HZ); jump_label_rate_limit(&apic_sw_disabled, HZ); } + +void kvm_lapic_exit(void) +{ + static_key_deferred_flush(&apic_hw_disabled); + static_key_deferred_flush(&apic_sw_disabled); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e0c80233b3e1..ff8039d61672 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -110,6 +110,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); +void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f22810a7e0c..e52c9088660f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3182,6 +3182,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ + xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* @@ -3342,6 +3343,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, switch (cap->cap) { case KVM_CAP_HYPERV_SYNIC: + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; return kvm_hv_activate_synic(vcpu); default: return -EINVAL; @@ -6045,6 +6048,7 @@ out: void kvm_arch_exit(void) { + kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -6168,7 +6172,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, + &ctxt->exception); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 073d1f1a620b..a8e91ae89fb3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -156,13 +156,13 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { + unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; xloops *= 4; asm("mull %%edx" :"=d" (xloops), "=&a" (d0) - :"1" (xloops), "0" - (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); + :"1" (xloops), "0" (lpj * (HZ / 4))); __delay(++xloops); } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ea9c49adaa1f..8aa6bea1cd6c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,6 +15,7 @@ #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/init.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <asm/pgtable.h> @@ -406,6 +407,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 324e5713d386..af59f808742f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -293,7 +293,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3cd69832d7f4..3961103e9176 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -114,6 +114,16 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=42606 */ + { + .callback = set_nouse_crs, + .ident = "Supermicro X8DTH", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_PRODUCT_NAME, "X8DTH-i/6/iF/6F"), + DMI_MATCH(DMI_BIOS_VERSION, "2.0a"), + }, + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */ { diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6aad870e8962..04ca8764f0c0 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -19,8 +19,7 @@ #include <linux/efi.h> #include <linux/efi-bgrt.h> -struct acpi_table_bgrt *bgrt_tab; -void *__initdata bgrt_image; +struct acpi_table_bgrt bgrt_tab; size_t __initdata bgrt_image_size; struct bmp_header { @@ -28,66 +27,58 @@ struct bmp_header { u32 size; } __packed; -void __init efi_bgrt_init(void) +void __init efi_bgrt_init(struct acpi_table_header *table) { - acpi_status status; void *image; struct bmp_header bmp_header; + struct acpi_table_bgrt *bgrt = &bgrt_tab; if (acpi_disabled) return; - status = acpi_get_table("BGRT", 0, - (struct acpi_table_header **)&bgrt_tab); - if (ACPI_FAILURE(status)) - return; - - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + if (table->length < sizeof(bgrt_tab)) { pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", - bgrt_tab->header.length, sizeof(*bgrt_tab)); + table->length, sizeof(bgrt_tab)); return; } - if (bgrt_tab->version != 1) { + *bgrt = *(struct acpi_table_bgrt *)table; + if (bgrt->version != 1) { pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", - bgrt_tab->version); - return; + bgrt->version); + goto out; } - if (bgrt_tab->status & 0xfe) { + if (bgrt->status & 0xfe) { pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", - bgrt_tab->status); - return; + bgrt->status); + goto out; } - if (bgrt_tab->image_type != 0) { + if (bgrt->image_type != 0) { pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", - bgrt_tab->image_type); - return; + bgrt->image_type); + goto out; } - if (!bgrt_tab->image_address) { + if (!bgrt->image_address) { pr_notice("Ignoring BGRT: null image address\n"); - return; + goto out; } - image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); + image = early_memremap(bgrt->image_address, sizeof(bmp_header)); if (!image) { pr_notice("Ignoring BGRT: failed to map image header memory\n"); - return; + goto out; } memcpy(&bmp_header, image, sizeof(bmp_header)); - memunmap(image); + early_memunmap(image, sizeof(bmp_header)); if (bmp_header.id != 0x4d42) { pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", bmp_header.id); - return; + goto out; } bgrt_image_size = bmp_header.size; + efi_mem_reserve(bgrt->image_address, bgrt_image_size); - bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to map image memory\n"); - bgrt_image = NULL; - return; - } - - efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); + return; +out: + memset(bgrt, 0, sizeof(bgrt_tab)); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 936a488d6cf6..565dff3c9a12 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -210,6 +210,70 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } +#define OVERFLOW_ADDR_SHIFT (64 - EFI_PAGE_SHIFT) +#define OVERFLOW_ADDR_MASK (U64_MAX << OVERFLOW_ADDR_SHIFT) +#define U64_HIGH_BIT (~(U64_MAX >> 1)) + +static bool __init efi_memmap_entry_valid(const efi_memory_desc_t *md, int i) +{ + u64 end = (md->num_pages << EFI_PAGE_SHIFT) + md->phys_addr - 1; + u64 end_hi = 0; + char buf[64]; + + if (md->num_pages == 0) { + end = 0; + } else if (md->num_pages > EFI_PAGES_MAX || + EFI_PAGES_MAX - md->num_pages < + (md->phys_addr >> EFI_PAGE_SHIFT)) { + end_hi = (md->num_pages & OVERFLOW_ADDR_MASK) + >> OVERFLOW_ADDR_SHIFT; + + if ((md->phys_addr & U64_HIGH_BIT) && !(end & U64_HIGH_BIT)) + end_hi += 1; + } else { + return true; + } + + pr_warn_once(FW_BUG "Invalid EFI memory map entries:\n"); + + if (end_hi) { + pr_warn("mem%02u: %s range=[0x%016llx-0x%llx%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end_hi, end); + } else { + pr_warn("mem%02u: %s range=[0x%016llx-0x%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end); + } + return false; +} + +static void __init efi_clean_memmap(void) +{ + efi_memory_desc_t *out = efi.memmap.map; + const efi_memory_desc_t *in = out; + const efi_memory_desc_t *end = efi.memmap.map_end; + int i, n_removal; + + for (i = n_removal = 0; in < end; i++) { + if (efi_memmap_entry_valid(in, i)) { + if (out != in) + memcpy(out, in, efi.memmap.desc_size); + out = (void *)out + efi.memmap.desc_size; + } else { + n_removal++; + } + in = (void *)in + efi.memmap.desc_size; + } + + if (n_removal > 0) { + u64 size = efi.memmap.nr_map - n_removal; + + pr_warn("Removing %d invalid memory map entries.\n", n_removal); + efi_memmap_install(efi.memmap.phys_map, size); + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -472,15 +536,12 @@ void __init efi_init(void) } } + efi_clean_memmap(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } -void __init efi_late_init(void) -{ - efi_bgrt_init(); -} - void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { u64 addr, npages; @@ -894,6 +955,11 @@ static void __init __efi_enter_virtual_mode(void) return; } + if (efi_enabled(EFI_DBG)) { + pr_info("EFI runtime memory map:\n"); + efi_print_memmap(); + } + BUG_ON(!efi.systab); if (efi_setup_page_tables(pa, 1 << pg_shift)) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 319148bd4b05..a4695da42d77 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -269,6 +269,22 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.use_pgd = true; /* + * Certain firmware versions are way too sentimential and still believe + * they are exclusive and unquestionable owners of the first physical page, + * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY + * (but then write-access it later during SetVirtualAddressMap()). + * + * Create a 1:1 mapping for this page, to avoid triple faults during early + * boot with such firmware. We are free to hand this page to the BIOS, + * as trim_bios_range() will reserve the first page and isolate it away + * from memory allocators anyway. + */ + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pr_err("Failed to create 1:1 mapping for the first page!\n"); + return 1; + } + + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel * text and allocate a new stack because we can't rely on the @@ -398,10 +414,44 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) efi_setup = phys_addr + sizeof(struct setup_data); } -void __init efi_runtime_update_mappings(void) +static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf) { unsigned long pfn; pgd_t *pgd = efi_pgd; + int err1, err2; + + /* Update the 1:1 mapping */ + pfn = md->phys_addr >> PAGE_SHIFT; + err1 = kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf); + if (err1) { + pr_err("Error while updating 1:1 mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + err2 = kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf); + if (err2) { + pr_err("Error while updating VA mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + return err1 || err2; +} + +static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md) +{ + unsigned long pf = 0; + + if (md->attribute & EFI_MEMORY_XP) + pf |= _PAGE_NX; + + if (!(md->attribute & EFI_MEMORY_RO)) + pf |= _PAGE_RW; + + return efi_update_mappings(md, pf); +} + +void __init efi_runtime_update_mappings(void) +{ efi_memory_desc_t *md; if (efi_enabled(EFI_OLD_MEMMAP)) { @@ -410,6 +460,24 @@ void __init efi_runtime_update_mappings(void) return; } + /* + * Use the EFI Memory Attribute Table for mapping permissions if it + * exists, since it is intended to supersede EFI_PROPERTIES_TABLE. + */ + if (efi_enabled(EFI_MEM_ATTR)) { + efi_memattr_apply_permissions(NULL, efi_update_mem_attr); + return; + } + + /* + * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace + * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update + * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not + * published by the firmware. Even if we find a buggy implementation of + * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to + * EFI_PROPERTIES_TABLE, because of the same reason. + */ + if (!efi_enabled(EFI_NX_PE_DATA)) return; @@ -430,15 +498,7 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; - /* Update the 1:1 mapping */ - pfn = md->phys_addr >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); - - if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); + efi_update_mappings(md, pf); } } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 10aca63a50d7..30031d5293c4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -214,7 +214,7 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Could not allocate boot services memmap\n"); return; @@ -355,7 +355,7 @@ void __init efi_free_boot_services(void) } new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Failed to allocate new EFI memmap\n"); return; diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 61b5ed2b7d40..90e4f2a6625b 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -15,7 +15,7 @@ obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b399ee..27186ad654c9 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include <linux/err.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/spi/pxa2xx_spi.h> @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index d957d5f21a86..0bc60a308730 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,6 +1,6 @@ config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB + depends on RAS && X86_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different |