diff options
Diffstat (limited to 'arch/x86')
103 files changed, 2054 insertions, 1221 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a0be022f91d..45b94fa9e98c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -119,6 +119,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU @@ -2422,7 +2423,7 @@ menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER def_bool y - depends on X86_64 && HIBERNATION + depends on HIBERNATION source "kernel/power/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8f6e7eb8ae9f..5b562e464009 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -193,7 +193,6 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) -asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1) @@ -237,6 +236,13 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all +archmacros: + $(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s + +ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s -Wa,- +export ASM_MACRO_FLAGS +KBUILD_CFLAGS += $(ASM_MACRO_FLAGS) + ### # Kernel objects diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 28764dacf018..466f66c8a7f8 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -37,6 +37,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1458b1700fc7..8b4c5e001157 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -738,6 +738,7 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) struct desc_struct *desc; void *handle; efi_system_table_t *_table; + unsigned long cmdline_paddr; efi_early = c; @@ -756,6 +757,15 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) setup_boot_services32(efi_early); /* + * make_boot_params() may have been called before efi_main(), in which + * case this is the second time we parse the cmdline. This is ok, + * parsing the cmdline multiple times does not have side-effects. + */ + cmdline_paddr = ((u64)hdr->cmd_line_ptr | + ((u64)boot_params->ext_cmd_line_ptr << 32)); + efi_parse_options((char *)cmdline_paddr); + + /* * If the boot loader gave us a value for secure_boot then we use that, * otherwise we ask the BIOS. */ diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index d4e6cd4577e5..bf0e82400358 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -391,6 +391,13 @@ int main(int argc, char ** argv) die("Unable to mmap '%s': %m", argv[2]); /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ sys_size = (sz + 15 + 4) / 16; +#ifdef CONFIG_EFI_STUB + /* + * COFF requires minimum 32-byte alignment of sections, and + * adding a signature is problematic without that alignment. + */ + sys_size = (sys_size + 1) & ~1; +#endif /* Patch the setup code with the appropriate size parameters */ buf[0x1f1] = setup_sectors-1; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 0eb9f92f3717..6c3ab05c231d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -247,6 +247,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e32fc1f274d8..ac9ae487cfeb 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -243,6 +243,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 352e70cd33e8..708b46a54578 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -338,7 +338,7 @@ For 32-bit we have the following conventions - kernel is built with .macro CALL_enter_from_user_mode #ifdef CONFIG_CONTEXT_TRACKING #ifdef HAVE_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 + STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1 #endif call enter_from_user_mode .Lafter_call_\@: diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2767c625a52c..fbbf1ba57ec6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -389,6 +389,13 @@ * that register for the time this macro runs */ + /* + * The high bits of the CS dword (__csh) are used for + * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case + * hardware didn't do this for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -407,12 +414,6 @@ /* Load top of task-stack into %edi */ movl TSS_entry2task_stack(%edi), %edi - /* - * Clear unused upper bits of the dword containing the word-sized CS - * slot in pt_regs in case hardware didn't clear it for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Special case - entry from kernel mode via entry stack */ #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 957dfb693ecc..f95dcb209fdf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1187,6 +1187,16 @@ ENTRY(paranoid_entry) xorl %ebx, %ebx 1: + /* + * Always stash CR3 in %r14. This value will be restored, + * verbatim, at exit. Needed if paranoid_entry interrupted + * another entry that already switched to the user CR3 value + * but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ret @@ -1211,11 +1221,13 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel @@ -1626,6 +1638,7 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 testl %ebx, %ebx /* swapgs needed? */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index dfb2f7c0d019..de32741d041a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1033,6 +1033,27 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +/** + * x86_perf_rdpmc_index - Return PMC counter used for event + * @event: the perf_event to which the PMC counter was assigned + * + * The counter assigned to this performance event may change if interrupts + * are enabled. This counter should thus never be used while interrupts are + * enabled. Before this function is used to obtain the assigned counter the + * event should be checked for validity using, for example, + * perf_event_read_local(), within the same interrupt disabled section in + * which this counter is planned to be used. + * + * Return: The index of the performance monitoring counter assigned to + * @perf_event. + */ +int x86_perf_rdpmc_index(struct perf_event *event) +{ + lockdep_assert_irqs_disabled(); + + return event->hw.event_base_rdpmc; +} + static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) @@ -1584,7 +1605,7 @@ static void __init pmu_check_apic(void) } -static struct attribute_group x86_pmu_format_group = { +static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; @@ -1631,9 +1652,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) struct attribute **new; int j, i; - for (j = 0; a[j]; j++) + for (j = 0; a && a[j]; j++) ; - for (i = 0; b[i]; i++) + for (i = 0; b && b[i]; i++) j++; j++; @@ -1642,9 +1663,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) return NULL; j = 0; - for (i = 0; a[i]; i++) + for (i = 0; a && a[i]; i++) new[j++] = a[i]; - for (i = 0; b[i]; i++) + for (i = 0; b && b[i]; i++) new[j++] = b[i]; new[j] = NULL; @@ -1715,7 +1736,7 @@ static struct attribute *events_attr[] = { NULL, }; -static struct attribute_group x86_pmu_events_group = { +static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, }; @@ -2230,7 +2251,7 @@ static struct attribute *x86_pmu_attrs[] = { NULL, }; -static struct attribute_group x86_pmu_attr_group = { +static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; @@ -2248,7 +2269,7 @@ static struct attribute *x86_pmu_caps_attrs[] = { NULL }; -static struct attribute_group x86_pmu_caps_group = { +static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 035c37481f57..0fb8659b20d8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -242,7 +242,7 @@ EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); -static struct attribute *nhm_events_attrs[] = { +static struct attribute *nhm_mem_events_attrs[] = { EVENT_PTR(mem_ld_nhm), NULL, }; @@ -278,8 +278,6 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); static struct attribute *snb_events_attrs[] = { - EVENT_PTR(mem_ld_snb), - EVENT_PTR(mem_st_snb), EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), EVENT_PTR(td_fetch_bubbles), @@ -290,6 +288,12 @@ static struct attribute *snb_events_attrs[] = { NULL, }; +static struct attribute *snb_mem_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -1995,6 +1999,18 @@ static void intel_pmu_nhm_enable_all(int added) intel_pmu_enable_all(added); } +static void enable_counter_freeze(void) +{ + update_debugctlmsr(get_debugctlmsr() | + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); +} + +static void disable_counter_freeze(void) +{ + update_debugctlmsr(get_debugctlmsr() & + ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); +} + static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2200,59 +2216,15 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) +static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 status; - int handled; - int pmu_enabled; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * Save the PMU state. - * It needs to be restored when leaving the handler. - */ - pmu_enabled = cpuc->enabled; - /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. - */ - if (!x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - intel_bts_disable_local(); - cpuc->enabled = 0; - __intel_pmu_disable_all(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - status = intel_pmu_get_status(); - if (!status) - goto done; - - loops = 0; -again: - intel_pmu_lbr_read(); - intel_pmu_ack_status(status); - if (++loops > 100) { - static bool warned = false; - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int bit; + int handled = 0; inc_irq_stat(apic_perf_irqs); - /* * Ignore a range of extra bits in status that do not indicate * overflow by themselves. @@ -2261,7 +2233,7 @@ again: GLOBAL_STATUS_ASIF | GLOBAL_STATUS_LBRS_FROZEN); if (!status) - goto done; + return 0; /* * In case multiple PEBS events are sampled at the same time, * it is possible to have GLOBAL_STATUS bit 62 set indicating @@ -2331,6 +2303,146 @@ again: x86_pmu_stop(event, 0); } + return handled; +} + +static bool disable_counter_freezing; +static int __init intel_perf_counter_freezing_setup(char *s) +{ + disable_counter_freezing = true; + pr_info("Intel PMU Counter freezing feature disabled\n"); + return 1; +} +__setup("disable_counter_freezing", intel_perf_counter_freezing_setup); + +/* + * Simplified handler for Arch Perfmon v4: + * - We rely on counter freezing/unfreezing to enable/disable the PMU. + * This is done automatically on PMU ack. + * - Ack the PMU only after the APIC. + */ + +static int intel_pmu_handle_irq_v4(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled = 0; + bool bts = false; + u64 status; + int pmu_enabled = cpuc->enabled; + int loops = 0; + + /* PMU has been disabled because of counter freezing */ + cpuc->enabled = 0; + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + bts = true; + intel_bts_disable_local(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + } + status = intel_pmu_get_status(); + if (!status) + goto done; +again: + intel_pmu_lbr_read(); + if (++loops > 100) { + static bool warned; + + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + + handled += handle_pmi_common(regs, status); +done: + /* Ack the PMI in the APIC */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * The counters start counting immediately while ack the status. + * Make it as close as possible to IRET. This avoids bogus + * freezing on Skylake CPUs. + */ + if (status) { + intel_pmu_ack_status(status); + } else { + /* + * CPU may issues two PMIs very close to each other. + * When the PMI handler services the first one, the + * GLOBAL_STATUS is already updated to reflect both. + * When it IRETs, the second PMI is immediately + * handled and it sees clear status. At the meantime, + * there may be a third PMI, because the freezing bit + * isn't set since the ack in first PMI handlers. + * Double check if there is more work to be done. + */ + status = intel_pmu_get_status(); + if (status) + goto again; + } + + if (bts) + intel_bts_enable_local(); + cpuc->enabled = pmu_enabled; + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc; + int loops; + u64 status; + int handled; + int pmu_enabled; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. + */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); + cpuc->enabled = 0; + __intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + status = intel_pmu_get_status(); + if (!status) + goto done; + + loops = 0; +again: + intel_pmu_lbr_read(); + intel_pmu_ack_status(status); + if (++loops > 100) { + static bool warned; + + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + handled += handle_pmi_common(regs, status); + /* * Repeat if there is more work to be done: */ @@ -3350,6 +3462,9 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (x86_pmu.counter_freezing) + enable_counter_freeze(); + if (!cpuc->shared_regs) return; @@ -3421,6 +3536,9 @@ static void intel_pmu_cpu_dying(int cpu) free_excl_cntrs(cpu); fini_debug_store_on_cpu(cpu); + + if (x86_pmu.counter_freezing) + disable_counter_freeze(); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -3725,6 +3843,40 @@ static __init void intel_nehalem_quirk(void) } } +static bool intel_glp_counter_freezing_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown stepping */ + + switch (cpu_data(cpu).x86_stepping) { + case 1: + rev = 0x28; + break; + case 8: + rev = 0x6; + break; + } + + return (cpu_data(cpu).microcode < rev); +} + +static __init void intel_glp_counter_freezing_quirk(void) +{ + /* Check if it's already disabled */ + if (disable_counter_freezing) + return; + + /* + * If the system starts with the wrong ucode, leave the + * counter-freezing feature permanently disabled. + */ + if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) { + pr_info("PMU counter freezing disabled due to CPU errata," + "please upgrade microcode\n"); + x86_pmu.counter_freezing = false; + x86_pmu.handle_irq = intel_pmu_handle_irq; + } +} + /* * enable software workaround for errata: * SNB: BJ122 @@ -3764,8 +3916,6 @@ EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); static struct attribute *hsw_events_attrs[] = { - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), EVENT_PTR(td_fetch_bubbles), @@ -3776,6 +3926,12 @@ static struct attribute *hsw_events_attrs[] = { NULL }; +static struct attribute *hsw_mem_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL, +}; + static struct attribute *hsw_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_commit), @@ -3792,13 +3948,6 @@ static struct attribute *hsw_tsx_events_attrs[] = { NULL }; -static __init struct attribute **get_hsw_events_attrs(void) -{ - return boot_cpu_has(X86_FEATURE_RTM) ? - merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) : - hsw_events_attrs; -} - static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -3875,9 +4024,32 @@ static struct attribute *intel_pmu_attrs[] = { NULL, }; +static __init struct attribute ** +get_events_attrs(struct attribute **base, + struct attribute **mem, + struct attribute **tsx) +{ + struct attribute **attrs = base; + struct attribute **old; + + if (mem && x86_pmu.pebs) + attrs = merge_attr(attrs, mem); + + if (tsx && boot_cpu_has(X86_FEATURE_RTM)) { + old = attrs; + attrs = merge_attr(attrs, tsx); + if (old != base) + kfree(old); + } + + return attrs; +} + __init int intel_pmu_init(void) { struct attribute **extra_attr = NULL; + struct attribute **mem_attr = NULL; + struct attribute **tsx_attr = NULL; struct attribute **to_free = NULL; union cpuid10_edx edx; union cpuid10_eax eax; @@ -3935,6 +4107,9 @@ __init int intel_pmu_init(void) max((int)edx.split.num_counters_fixed, assume); } + if (version >= 4) + x86_pmu.counter_freezing = !disable_counter_freezing; + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -3986,7 +4161,7 @@ __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - x86_pmu.cpu_events = nhm_events_attrs; + mem_attr = nhm_mem_events_attrs; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4004,11 +4179,11 @@ __init int intel_pmu_init(void) name = "nehalem"; break; - case INTEL_FAM6_ATOM_PINEVIEW: - case INTEL_FAM6_ATOM_LINCROFT: - case INTEL_FAM6_ATOM_PENWELL: - case INTEL_FAM6_ATOM_CLOVERVIEW: - case INTEL_FAM6_ATOM_CEDARVIEW: + case INTEL_FAM6_ATOM_BONNELL: + case INTEL_FAM6_ATOM_BONNELL_MID: + case INTEL_FAM6_ATOM_SALTWELL: + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4021,9 +4196,11 @@ __init int intel_pmu_init(void) name = "bonnell"; break; - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_MID: case INTEL_FAM6_ATOM_AIRMONT: + case INTEL_FAM6_ATOM_AIRMONT_MID: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, @@ -4042,7 +4219,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -4068,7 +4245,8 @@ __init int intel_pmu_init(void) name = "goldmont"; break; - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + x86_add_quirk(intel_glp_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -4112,7 +4290,7 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_westmere_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = nhm_events_attrs; + mem_attr = nhm_mem_events_attrs; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4152,6 +4330,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; + mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4192,6 +4371,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; + mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4226,10 +4406,12 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; pr_cont("Haswell events, "); name = "haswell"; break; @@ -4265,10 +4447,12 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; pr_cont("Broadwell events, "); name = "broadwell"; break; @@ -4324,7 +4508,9 @@ __init int intel_pmu_init(void) hsw_format_attr : nhm_format_attr; extra_attr = merge_attr(extra_attr, skl_format_attr); to_free = extra_attr; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); @@ -4357,6 +4543,9 @@ __init int intel_pmu_init(void) WARN_ON(!x86_pmu.format_attrs); } + x86_pmu.cpu_events = get_events_attrs(x86_pmu.cpu_events, + mem_attr, tsx_attr); + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4431,6 +4620,13 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + /* + * For arch perfmon 4 use counter freezing to avoid + * several MSR accesses in the PMI. + */ + if (x86_pmu.counter_freezing) + x86_pmu.handle_irq = intel_pmu_handle_irq_v4; + kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9f8084f18d58..d2e780705c5a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -559,8 +559,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), @@ -581,9 +581,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 8d016ce5b80d..3a0aa83cbd07 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -95,7 +95,7 @@ static ssize_t pt_cap_show(struct device *cdev, return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); } -static struct attribute_group pt_cap_group = { +static struct attribute_group pt_cap_group __ro_after_init = { .name = "caps", }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 32f3e9423e99..91039ffed633 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -777,9 +777,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index b4771a6ddbc1..1b9f85abf9bc 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -69,14 +69,14 @@ static bool test_intel(int idx) case INTEL_FAM6_BROADWELL_GT3E: case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 156286335351..adae087cecdd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -560,9 +560,11 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; - bool late_ack; u64 (*limit_period)(struct perf_event *event, u64 l); + /* PMI handler bits */ + unsigned int late_ack :1, + counter_freezing :1; /* * sysfs attrs */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e..8e4ea39e55d0 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -7,16 +7,24 @@ #include <asm/asm.h> #ifdef CONFIG_SMP - .macro LOCK_PREFIX -672: lock +.macro LOCK_PREFIX_HERE .pushsection .smp_locks,"a" .balign 4 - .long 672b - . + .long 671f - . # offset .popsection - .endm +671: +.endm + +.macro LOCK_PREFIX insn:vararg + LOCK_PREFIX_HERE + lock \insn +.endm #else - .macro LOCK_PREFIX - .endm +.macro LOCK_PREFIX_HERE +.endm + +.macro LOCK_PREFIX insn:vararg +.endm #endif /* diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4cd6a3b71824..d7faa16622d8 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -31,15 +31,8 @@ */ #ifdef CONFIG_SMP -#define LOCK_PREFIX_HERE \ - ".pushsection .smp_locks,\"a\"\n" \ - ".balign 4\n" \ - ".long 671f - .\n" /* offset */ \ - ".popsection\n" \ - "671:" - -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " - +#define LOCK_PREFIX_HERE "LOCK_PREFIX_HERE\n\t" +#define LOCK_PREFIX "LOCK_PREFIX " #else /* ! CONFIG_SMP */ #define LOCK_PREFIX_HERE "" #define LOCK_PREFIX "" diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 990770f9e76b..21b086786404 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -120,16 +120,32 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ - .pushsection "__ex_table","a" ; \ - .balign 4 ; \ - .long (from) - . ; \ - .long (to) - . ; \ - .long (handler) - . ; \ + ASM_EXTABLE_HANDLE from to handler + +.macro ASM_EXTABLE_HANDLE from:req to:req handler:req + .pushsection "__ex_table","a" + .balign 4 + .long (\from) - . + .long (\to) - . + .long (\handler) - . .popsection +.endm +#else /* __ASSEMBLY__ */ + +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + "ASM_EXTABLE_HANDLE from=" #from " to=" #to \ + " handler=\"" #handler "\"\n\t" + +/* For C file, we already have NOKPROBE_SYMBOL macro */ + +#endif /* __ASSEMBLY__ */ # define _ASM_EXTABLE(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) +# define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) @@ -145,6 +161,7 @@ _ASM_PTR (entry); \ .popsection +#ifdef __ASSEMBLY__ .macro ALIGN_DESTINATION /* check for bad alignment of destination */ movl %edi,%ecx @@ -165,34 +182,10 @@ jmp copy_user_handle_tail .previous - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) + _ASM_EXTABLE_UA(100b, 103b) + _ASM_EXTABLE_UA(101b, 103b) .endm - -#else -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 4\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ - " .popsection\n" - -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - -/* For C file, we already have NOKPROBE_SYMBOL macro */ -#endif +#endif /* __ASSEMBLY__ */ #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ce84388e540c..ea3d95275b43 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -82,7 +82,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test @@ -122,7 +122,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v) */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test @@ -136,7 +136,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test @@ -151,7 +151,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 5f851d92eecd..dadc20adba21 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -73,7 +73,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) */ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test @@ -115,7 +115,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) */ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test @@ -129,7 +129,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) */ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test @@ -144,7 +144,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) */ static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } #define arch_atomic64_add_negative arch_atomic64_add_negative diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9f645ba57dbb..124f9195eb3e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -217,8 +217,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -264,8 +263,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -318,8 +316,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 6804d6642767..5090035e6d16 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -4,6 +4,8 @@ #include <linux/stringify.h> +#ifndef __ASSEMBLY__ + /* * Despite that some emulators terminate on UD2, we use it for WARN(). * @@ -20,53 +22,15 @@ #define LEN_UD2 2 -#ifdef CONFIG_GENERIC_BUG - -#ifdef CONFIG_X86_32 -# define __BUG_REL(val) ".long " __stringify(val) -#else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" -#endif - -#ifdef CONFIG_DEBUG_BUGVERBOSE - -#define _BUG_FLAGS(ins, flags) \ -do { \ - asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ - "\t.word %c1" "\t# bug_entry::line\n" \ - "\t.word %c2" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c3\n" \ - ".popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (flags), \ - "i" (sizeof(struct bug_entry))); \ -} while (0) - -#else /* !CONFIG_DEBUG_BUGVERBOSE */ - #define _BUG_FLAGS(ins, flags) \ do { \ - asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t.word %c0" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c1\n" \ - ".popsection" \ - : : "i" (flags), \ + asm volatile("ASM_BUG ins=\"" ins "\" file=%c0 line=%c1 " \ + "flags=%c2 size=%c3" \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) -#endif /* CONFIG_DEBUG_BUGVERBOSE */ - -#else - -#define _BUG_FLAGS(ins, flags) asm volatile(ins) - -#endif /* CONFIG_GENERIC_BUG */ - #define HAVE_ARCH_BUG #define BUG() \ do { \ @@ -82,4 +46,54 @@ do { \ #include <asm-generic/bug.h> +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_GENERIC_BUG + +#ifdef CONFIG_X86_32 +.macro __BUG_REL val:req + .long \val +.endm +#else +.macro __BUG_REL val:req + .long \val - 2b +.endm +#endif + +#ifdef CONFIG_DEBUG_BUGVERBOSE + +.macro ASM_BUG ins:req file:req line:req flags:req size:req +1: \ins + .pushsection __bug_table,"aw" +2: __BUG_REL val=1b # bug_entry::bug_addr + __BUG_REL val=\file # bug_entry::file + .word \line # bug_entry::line + .word \flags # bug_entry::flags + .org 2b+\size + .popsection +.endm + +#else /* !CONFIG_DEBUG_BUGVERBOSE */ + +.macro ASM_BUG ins:req file:req line:req flags:req size:req +1: \ins + .pushsection __bug_table,"aw" +2: __BUG_REL val=1b # bug_entry::bug_addr + .word \flags # bug_entry::flags + .org 2b+\size + .popsection +.endm + +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + +#else /* CONFIG_GENERIC_BUG */ + +.macro ASM_BUG ins:req file:req line:req flags:req size:req + \ins +.endm + +#endif /* CONFIG_GENERIC_BUG */ + +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aced6c9290d6..7d442722ef24 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -2,10 +2,10 @@ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#include <asm/processor.h> - -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +#include <asm/processor.h> #include <asm/asm.h> #include <linux/bitops.h> @@ -161,37 +161,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P[always]\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P[feature]\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" + asm_volatile_goto("STATIC_CPU_HAS bitnum=%[bitnum] " + "cap_byte=\"%[cap_byte]\" " + "feature=%P[feature] t_yes=%l[t_yes] " + "t_no=%l[t_no] always=%P[always]" : : [feature] "i" (bit), [always] "i" (X86_FEATURE_ALWAYS), [bitnum] "i" (1 << (bit & 7)), @@ -226,5 +199,44 @@ t_no: #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model -#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ +#else /* __ASSEMBLY__ */ + +.macro STATIC_CPU_HAS bitnum:req cap_byte:req feature:req t_yes:req t_no:req always:req +1: + jmp 6f +2: + .skip -(((5f-4f) - (2b-1b)) > 0) * ((5f-4f) - (2b-1b)),0x90 +3: + .section .altinstructions,"a" + .long 1b - . /* src offset */ + .long 4f - . /* repl offset */ + .word \always /* always replace */ + .byte 3b - 1b /* src len */ + .byte 5f - 4f /* repl len */ + .byte 3b - 2b /* pad len */ + .previous + .section .altinstr_replacement,"ax" +4: + jmp \t_no +5: + .previous + .section .altinstructions,"a" + .long 1b - . /* src offset */ + .long 0 /* no replacement */ + .word \feature /* feature bit */ + .byte 3b - 1b /* src len */ + .byte 0 /* repl len */ + .byte 0 /* pad len */ + .previous + .section .altinstr_aux,"ax" +6: + testb \bitnum,\cap_byte + jnz \t_yes + jmp \t_no + .previous +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __KERNEL__ */ #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index cec5fae23eb3..eea40d52ca78 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,6 +140,7 @@ extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); +extern void efi_recover_from_page_fault(unsigned long phys_addr); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1527ec351036..69c0f892e310 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -63,8 +63,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ #define R_X86_64_8 14 /* Direct 8 bit sign extended */ #define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ - -#define R_X86_64_NUM 16 +#define R_X86_64_PC64 24 /* Place relative 64-bit signed */ /* * These are used to set parameters in the core dumps. diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index f9c3a5d502f4..d8c2198d543b 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -29,7 +29,8 @@ struct pt_regs; (b)->handler = (tmp).handler - (delta); \ } while (0) -extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern int fixup_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a38bf5a1e37a..5f7290e6e954 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -226,7 +226,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) "3: movl $-2,%[err]\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -528,7 +528,7 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->initialized) { + if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index de4d68852d3a..13c83fe97988 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -20,7 +20,7 @@ "3:\tmov\t%3, %1\n" \ "\tjmp\t2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ : "i" (-EFAULT), "0" (oparg), "1" (0)) @@ -36,8 +36,8 @@ "4:\tmov\t%5, %1\n" \ "\tjmp\t3b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=&a" (oldval), "=&r" (ret), \ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 7ed08a7c3398..0dd6b0f4000e 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -8,9 +8,6 @@ * The "_X" parts are generally the EP and EX Xeons, or the * "Extreme" ones, like Broadwell-E. * - * Things ending in "2" are usually because we have no better - * name for them. There's no processor called "SILVERMONT2". - * * While adding a new CPUID for a new microarchitecture, add a new * group to keep logically sorted out in chronological order. Within * that group keep the CPUID for the variants sorted by model number. @@ -57,19 +54,23 @@ /* "Small Core" Processors (Atom) */ -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C -#define INTEL_FAM6_ATOM_LINCROFT 0x26 -#define INTEL_FAM6_ATOM_PENWELL 0x27 -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ + +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ + +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ + +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ + +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6de64840dd22..9a92a3ac2ac5 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -369,18 +369,6 @@ extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); -#ifdef CONFIG_XEN -#include <xen/xen.h> -struct bio_vec; - -extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2); - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ - (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) -#endif /* CONFIG_XEN */ - #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 8c0de4282659..a5fb34fe56a4 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,19 +2,6 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifndef HAVE_JUMP_LABEL -/* - * For better or for worse, if jump labels (the gcc extension) are missing, - * then the entire static branch patching infrastructure is compiled out. - * If that happens, the code in here will malfunction. Raise a compiler - * error instead. - * - * In theory, jump labels and the static branch patching infrastructure - * could be decoupled to fix this. - */ -#error asm/jump_label.h included on a non-jump-label kernel -#endif - #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 @@ -33,14 +20,9 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" - ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" - ".popsection \n\t" - : : "i" (key), "i" (branch) : : l_yes); - + asm_volatile_goto("STATIC_BRANCH_NOP l_yes=\"%l[l_yes]\" key=\"%c0\" " + "branch=\"%c1\"" + : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; @@ -48,13 +30,8 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:" - ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" - "2:\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" - ".popsection \n\t" + asm_volatile_goto("STATIC_BRANCH_JMP l_yes=\"%l[l_yes]\" key=\"%c0\" " + "branch=\"%c1\"" : : "i" (key), "i" (branch) : : l_yes); return false; @@ -62,49 +39,28 @@ l_yes: return true; } -#ifdef CONFIG_X86_64 -typedef u64 jump_label_t; -#else -typedef u32 jump_label_t; -#endif - -struct jump_entry { - jump_label_t code; - jump_label_t target; - jump_label_t key; -}; - #else /* __ASSEMBLY__ */ -.macro STATIC_JUMP_IF_TRUE target, key, def -.Lstatic_jump_\@: - .if \def - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .else - .byte STATIC_KEY_INIT_NOP - .endif +.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req +.Lstatic_branch_nop_\@: + .byte STATIC_KEY_INIT_NOP +.Lstatic_branch_no_after_\@: .pushsection __jump_table, "aw" _ASM_ALIGN - _ASM_PTR .Lstatic_jump_\@, \target, \key + .long .Lstatic_branch_nop_\@ - ., \l_yes - . + _ASM_PTR \key + \branch - . .popsection .endm -.macro STATIC_JUMP_IF_FALSE target, key, def -.Lstatic_jump_\@: - .if \def - .byte STATIC_KEY_INIT_NOP - .else - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .endif +.macro STATIC_BRANCH_JMP l_yes:req key:req branch:req +.Lstatic_branch_jmp_\@: + .byte 0xe9 + .long \l_yes - .Lstatic_branch_jmp_after_\@ +.Lstatic_branch_jmp_after_\@: .pushsection __jump_table, "aw" _ASM_ALIGN - _ASM_PTR .Lstatic_jump_\@, \target, \key + 1 + .long .Lstatic_branch_jmp_\@ - ., \l_yes - . + _ASM_PTR \key + \branch - . .popsection .endm diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index c91083c59845..349a47acaa4a 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -53,7 +53,7 @@ static inline void local_sub(long i, local_t *l) */ static inline bool local_sub_and_test(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** @@ -66,7 +66,7 @@ static inline bool local_sub_and_test(long i, local_t *l) */ static inline bool local_dec_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e); + return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** @@ -79,7 +79,7 @@ static inline bool local_dec_and_test(local_t *l) */ static inline bool local_inc_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e); + return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** @@ -93,7 +93,7 @@ static inline bool local_inc_and_test(local_t *l) */ static inline bool local_add_negative(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3a17107594c8..97d6969f9a8a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -10,41 +10,44 @@ /* MCG_CAP register defines */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ +#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) -#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ -#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ -#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ +#define MCG_ELOG_P BIT_ULL(26) /* Extended error log supported */ +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ /* MCG_STATUS register defines */ -#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ -#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ +#define MCG_STATUS_RIPV BIT_ULL(0) /* restart ip valid */ +#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ +#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ /* MCG_EXT_CTL register defines */ -#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ +#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ /* MCi_STATUS register defines */ -#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ -#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ -#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ -#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ -#define MCI_STATUS_AR (1ULL<<55) /* Action required */ +#define MCI_STATUS_VAL BIT_ULL(63) /* valid error */ +#define MCI_STATUS_OVER BIT_ULL(62) /* previous errors lost */ +#define MCI_STATUS_UC BIT_ULL(61) /* uncorrected error */ +#define MCI_STATUS_EN BIT_ULL(60) /* error enabled */ +#define MCI_STATUS_MISCV BIT_ULL(59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV BIT_ULL(58) /* addr reg. valid */ +#define MCI_STATUS_PCC BIT_ULL(57) /* processor context corrupt */ +#define MCI_STATUS_S BIT_ULL(56) /* Signaled machine check */ +#define MCI_STATUS_AR BIT_ULL(55) /* Action required */ +#define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ +#define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) +#define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) /* AMD-specific bits */ -#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ -#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ +#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ +#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ +#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -84,7 +87,7 @@ #define MCI_MISC_ADDR_GENERIC 7 /* generic */ /* CTL2 register defines */ -#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_EN BIT_ULL(30) #define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4731f0cf97c5..80f4a4f38c79 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -164,6 +164,7 @@ #define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4b75acc23b30..83ce282eed0a 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -346,23 +346,11 @@ extern struct pv_lock_ops pv_lock_ops; #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - /* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") +#define paravirt_call \ + "PARAVIRT_CALL type=\"%c[paravirt_typenum]\"" \ + " clobber=\"%c[paravirt_clobber]\"" \ + " pv_opptr=\"%c[paravirt_opptr]\";" /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -391,16 +379,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, int paravirt_disable_iospace(void); /* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%c[paravirt_opptr];" - -/* * These macros are intended to wrap calls through one of the paravirt * ops structs, so that they can be later identified and patched at * runtime. @@ -537,7 +515,7 @@ int paravirt_disable_iospace(void); /* since this condition will never hold */ \ if (sizeof(rettype) > sizeof(unsigned long)) { \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -547,7 +525,7 @@ int paravirt_disable_iospace(void); __ret = (rettype)((((u64)__edx) << 32) | __eax); \ } else { \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -574,7 +552,7 @@ int paravirt_disable_iospace(void); PVOP_VCALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -694,6 +672,26 @@ struct paravirt_patch_site { extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; +#else /* __ASSEMBLY__ */ + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +.macro PARAVIRT_CALL type:req clobber:req pv_opptr:req +771: ANNOTATE_RETPOLINE_SAFE + call *\pv_opptr +772: .pushsection .parainstructions,"a" + _ASM_ALIGN + _ASM_PTR 771b + .byte \type + .byte 772b-771b + .short \clobber + .popsection +.endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e9202a0de8f0..1a19d11cfbbd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -185,22 +185,22 @@ do { \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(1)",%0" \ + asm volatile(op "b "__percpu_arg(1)",%0"\ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(1)",%0" \ + asm volatile(op "w "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(1)",%0" \ + asm volatile(op "l "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(1)",%0" \ + asm volatile(op "q "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 78241b736f2a..8bdf74902293 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -278,6 +278,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); +extern int x86_perf_rdpmc_index(struct perf_event *event); #else static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b64acb08a62b..106b7d0e2dae 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -124,7 +124,7 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7f2dbd91fc74..90cb2f36c042 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -88,7 +88,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } /* diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6de1fd3d0097..25f49af1b13c 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -37,8 +37,10 @@ struct pt_regs { unsigned short __esh; unsigned short fs; unsigned short __fsh; + /* On interrupt, gs and __gsh store the vector number. */ unsigned short gs; unsigned short __gsh; + /* On interrupt, this is the error code. */ unsigned long orig_ax; unsigned long ip; unsigned short cs; @@ -237,23 +239,51 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, } /** + * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns the address of the @n th entry of the + * kernel stack which is specified by @regs. If the @n th entry is NOT in + * the kernel stack, this returns NULL. + */ +static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return addr; + else + return NULL; +} + +/* To avoid include hell, we can't include uaccess.h */ +extern long probe_kernel_read(void *dst, const void *src, size_t size); + +/** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); - addr += n; - if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; - else - return 0; + unsigned long *addr; + unsigned long val; + long ret; + + addr = regs_get_kernel_stack_nth_addr(regs, n); + if (addr) { + ret = probe_kernel_read(&val, addr, sizeof(val)); + if (!ret) + return val; + } + return 0; } #define arch_has_single_step() (1) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 3e70bed8a978..87623c6b13db 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -6,9 +6,24 @@ #include <asm/cpufeature.h> #include <asm-generic/qspinlock_types.h> #include <asm/paravirt.h> +#include <asm/rmwcc.h> #define _Q_PENDING_LOOPS (1 << 9) +#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + u32 val = 0; + + if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c, + "I", _Q_PENDING_OFFSET)) + val |= _Q_PENDING_VAL; + + val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK; + + return val; +} + #ifdef CONFIG_PARAVIRT_SPINLOCKS extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 19b90521954c..a8b5e1e13319 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -4,6 +4,41 @@ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from * PaX/grsecurity. */ + +#ifdef __ASSEMBLY__ + +#include <asm/asm.h> +#include <asm/bug.h> + +.macro REFCOUNT_EXCEPTION counter:req + .pushsection .text..refcount +111: lea \counter, %_ASM_CX +112: ud2 + ASM_UNREACHABLE + .popsection +113: _ASM_EXTABLE_REFCOUNT(112b, 113b) +.endm + +/* Trigger refcount exception if refcount result is negative. */ +.macro REFCOUNT_CHECK_LT_ZERO counter:req + js 111f + REFCOUNT_EXCEPTION counter="\counter" +.endm + +/* Trigger refcount exception if refcount result is zero or negative. */ +.macro REFCOUNT_CHECK_LE_ZERO counter:req + jz 111f + REFCOUNT_CHECK_LT_ZERO counter="\counter" +.endm + +/* Trigger refcount exception unconditionally. */ +.macro REFCOUNT_ERROR counter:req + jmp 111f + REFCOUNT_EXCEPTION counter="\counter" +.endm + +#else /* __ASSEMBLY__ */ + #include <linux/refcount.h> #include <asm/bug.h> @@ -15,34 +50,11 @@ * central refcount exception. The fixup address for the exception points * back to the regular execution flow in .text. */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[counter], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. */ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. */ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION static __always_inline void refcount_add(unsigned int i, refcount_t *r) { asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO + "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : "ir" (i) : "cc", "cx"); @@ -51,7 +63,7 @@ static __always_inline void refcount_add(unsigned int i, refcount_t *r) static __always_inline void refcount_inc(refcount_t *r) { asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO + "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : : "cc", "cx"); } @@ -59,7 +71,7 @@ static __always_inline void refcount_inc(refcount_t *r) static __always_inline void refcount_dec(refcount_t *r) { asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO + "REFCOUNT_CHECK_LE_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : : "cc", "cx"); } @@ -67,14 +79,17 @@ static __always_inline void refcount_dec(refcount_t *r) static __always_inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) { - GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "er", i, "%0", e, "cx"); + + return GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", + "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"", + r->refs.counter, e, "er", i, "cx"); } static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) { - GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "%0", e, "cx"); + return GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", + "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"", + r->refs.counter, e, "cx"); } static __always_inline __must_check @@ -91,7 +106,7 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) /* Did we try to increment from/to an undesirable state? */ if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR + asm volatile("REFCOUNT_ERROR counter=\"%[counter]\"" : : [counter] "m" (r->refs.counter) : "cc", "cx"); break; @@ -107,4 +122,6 @@ static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) return refcount_add_not_zero(1, r); } +#endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4914a3e7c803..46ac84b506f5 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,56 +2,69 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +/* This counts to 12. Any more, it will return 13th argument. */ +#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n +#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +#define __RMWcc_CONCAT(a, b) a ## b +#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b) + #define __CLOBBERS_MEM(clb...) "memory", ## clb #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ -do { \ +#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ +({ \ + bool c = false; \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : [counter] "m" (var), ## __VA_ARGS__ \ + : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ - return 0; \ -cc_label: \ - return 1; \ -} while (0) - -#define __BINARY_RMWcc_ARG " %1, " - + if (0) { \ +cc_label: c = true; \ + } \ + c; \ +}) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ -do { \ +#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ +({ \ bool c; \ asm volatile (fullop CC_SET(cc) \ - : [counter] "+m" (var), CC_OUT(cc) (c) \ + : [var] "+m" (_var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ - return c; \ -} while (0) - -#define __BINARY_RMWcc_ARG " %2, " + c; \ +}) #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ +#define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) -#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\ - __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM(clobbers)) +#define GEN_UNARY_RMWcc_3(op, var, cc) \ + GEN_UNARY_RMWcc_4(op, var, cc, "%[var]") -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ - __CLOBBERS_MEM(), vcon (val)) +#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X) + +#define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \ + __GEN_RMWcc(op " %[val], " arg0, var, cc, \ + __CLOBBERS_MEM(), [val] vcon (_val)) + +#define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \ + GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]") + +#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \ + __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \ + __CLOBBERS_MEM(clobbers)) -#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc, \ - clobbers...) \ - __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM(clobbers), vcon (val)) +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, cc, vcon, _val, clobbers...)\ + __GEN_RMWcc(op " %[val], %[var]\n\t" suffix, var, cc, \ + __CLOBBERS_MEM(clobbers), [val] vcon (_val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index ecffe81ff65c..a892494ca5e4 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -4,3 +4,11 @@ #else # include <asm/suspend_64.h> #endif +extern unsigned long restore_jump_address __visible; +extern unsigned long jump_address_phys; +extern unsigned long restore_cr3 __visible; +extern unsigned long temp_pgt __visible; +extern unsigned long relocated_restore_code __visible; +extern int relocate_restore_code(void); +/* Defined in hibernate_asm_32/64.S */ +extern asmlinkage __visible int restore_image(void); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 8be6afb58471..fdbd9d7b7bca 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -32,4 +32,8 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); +/* routines for saving/restoring kernel state */ +extern char core_restore_code[]; +extern char restore_registers[]; + #endif /* _ASM_X86_SUSPEND_32_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index aae77eb8491c..b5e58cc0c5e7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -198,8 +198,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) "4: movl %3,%0\n" \ " jmp 3b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=r" (err) \ : "A" (x), "r" (addr), "i" (errret), "0" (err)) @@ -340,8 +340,8 @@ do { \ " xorl %%edx,%%edx\n" \ " jmp 3b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=r" (retval), "=&A"(x) \ : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ @@ -386,7 +386,7 @@ do { \ " xor"itype" %"rtype"1,%"rtype"1\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) @@ -398,7 +398,7 @@ do { \ "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) @@ -474,7 +474,7 @@ struct __large_struct { unsigned long buf[100]; }; "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r"(err) \ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) @@ -602,7 +602,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "q" (__new), "1" (__old) \ : "memory" \ @@ -618,7 +618,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ @@ -634,7 +634,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ @@ -653,7 +653,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index d383140e1dc8..068d9b067c83 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_XEN_EVENTS_H #define _ASM_X86_XEN_EVENTS_H +#include <xen/xen.h> + enum ipi_vector { XEN_RESCHEDULE_VECTOR, XEN_CALL_FUNCTION_VECTOR, diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index f299d8a479bb..3f9d1b4019bb 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -482,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -494,7 +494,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -504,7 +504,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free(dev, size, vaddr, dma_addr, attrs); + dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7654febd5102..652e7ffa9b9d 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -313,14 +313,13 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) struct apic_chip_data *apicd = apic_chip_data(irqd); int vector, cpu; - cpumask_and(vector_searchmask, vector_searchmask, affmsk); - cpu = cpumask_first(vector_searchmask); - if (cpu >= nr_cpu_ids) - return -EINVAL; + cpumask_and(vector_searchmask, dest, affmsk); + /* set_affinity might call here for nothing */ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) return 0; - vector = irq_matrix_alloc_managed(vector_matrix, cpu); + vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask, + &cpu); trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7da587f4af52..0b99a7fae6be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -949,11 +949,11 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) } static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, { X86_VENDOR_CENTAUR, 5 }, { X86_VENDOR_INTEL, 5 }, { X86_VENDOR_NSC, 5 }, @@ -968,10 +968,10 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { /* Only list CPUs which speculate but are non susceptible to SSB */ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, @@ -984,14 +984,14 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, {} diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index abb71ac70443..44272b7107ad 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -485,9 +485,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), - sizeof(unsigned long), - GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); @@ -496,7 +494,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_total) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); return -ENOMEM; } } @@ -504,7 +502,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_local) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); return -ENOMEM; } @@ -610,9 +608,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + /* + * rdt_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + kfree(d->ctrl_val); kfree(d->mbps_val); - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); kfree(d); diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 285eb3ec4200..3736f6dc9545 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -529,14 +529,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - u32 _cbm, int closid, bool exclusive); + unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, - u32 cbm); + unsigned long cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); int rdt_pseudo_lock_init(void); void rdt_pseudo_lock_release(void); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 0f53049719cd..27937458c231 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -404,8 +404,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, for_each_alloc_enabled_rdt_resource(r) seq_printf(s, "%s:uninitialized\n", r->name); } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, - rdtgrp->plr->d->id, rdtgrp->plr->cbm); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->r->name, + rdtgrp->plr->d->id, + rdtgrp->plr->cbm); + } } else { closid = rdtgrp->closid; for_each_alloc_enabled_rdt_resource(r) { diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 40f3903ae5d9..815b4e92522c 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/mman.h> +#include <linux/perf_event.h> #include <linux/pm_qos.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -26,6 +27,7 @@ #include <asm/intel_rdt_sched.h> #include <asm/perf_event.h> +#include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "intel_rdt.h" #define CREATE_TRACE_POINTS @@ -91,7 +93,7 @@ static u64 get_prefetch_disable_bits(void) */ return 0xF; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void) return 0; } -/* - * Helper to write 64bit value to MSR without tracing. Used when - * use of the cache should be restricted and use of registers used - * for local variables avoided. - */ -static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) -{ - __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); -} - /** * pseudo_lock_minor_get - Obtain available minor number * @minor: Pointer to where new minor number will be stored @@ -797,25 +789,27 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) /** * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked * @d: RDT domain - * @_cbm: CBM to test + * @cbm: CBM to test * - * @d represents a cache instance and @_cbm a capacity bitmask that is - * considered for it. Determine if @_cbm overlaps with any existing + * @d represents a cache instance and @cbm a capacity bitmask that is + * considered for it. Determine if @cbm overlaps with any existing * pseudo-locked region on @d. * - * Return: true if @_cbm overlaps with pseudo-locked region on @d, false + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: true if @cbm overlaps with pseudo-locked region on @d, false * otherwise. */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) { - unsigned long *cbm = (unsigned long *)&_cbm; - unsigned long *cbm_b; unsigned int cbm_len; + unsigned long cbm_b; if (d->plr) { cbm_len = d->plr->r->cache.cbm_len; - cbm_b = (unsigned long *)&d->plr->cbm; - if (bitmap_intersects(cbm, cbm_b, cbm_len)) + cbm_b = d->plr->cbm; + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) return true; } return false; @@ -886,31 +880,14 @@ static int measure_cycles_lat_fn(void *_plr) struct pseudo_lock_region *plr = _plr; unsigned long i; u64 start, end; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use a regular - * variable to ensure we always use a valid pointer to access memory. - * The cost is that accessing this pointer, which could be in - * cache, will be included in the measurement of memory read latency. - */ void *mem_r; -#else -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ local_irq_disable(); /* - * The wrmsr call may be reordered with the assignment below it. - * Call wrmsr as directly as possible to avoid tracing clobbering - * local register variable used for memory pointer. + * Disable hardware prefetchers. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - mem_r = plr->kmem; + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed * instructions into the L1 instruction cache. @@ -932,157 +909,240 @@ static int measure_cycles_lat_fn(void *_plr) return 0; } -static int measure_cycles_perf_fn(void *_plr) +/* + * Create a perf_event_attr for the hit and miss perf events that will + * be used during the performance measurement. A perf_event maintains + * a pointer to its perf_event_attr so a unique attribute structure is + * created for each perf_event. + * + * The actual configuration of the event is set right before use in order + * to use the X86_CONFIG macro. + */ +static struct perf_event_attr perf_miss_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +static struct perf_event_attr perf_hit_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +struct residency_counts { + u64 miss_before, hits_before; + u64 miss_after, hits_after; +}; + +static int measure_residency_fn(struct perf_event_attr *miss_attr, + struct perf_event_attr *hit_attr, + struct pseudo_lock_region *plr, + struct residency_counts *counts) { - unsigned long long l3_hits = 0, l3_miss = 0; - u64 l3_hit_bits = 0, l3_miss_bits = 0; - struct pseudo_lock_region *plr = _plr; - unsigned long long l2_hits, l2_miss; - u64 l2_hit_bits, l2_miss_bits; - unsigned long i; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use regular variables - * at the cost of including cache access latency to these variables - * in the measurements. - */ + u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; + struct perf_event *miss_event, *hit_event; + int hit_pmcnum, miss_pmcnum; unsigned int line_size; unsigned int size; + unsigned long i; void *mem_r; -#else - register unsigned int line_size asm("esi"); - register unsigned int size asm("edi"); -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ + u64 tmp; + + miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(miss_event)) + goto out; + + hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(hit_event)) + goto out_miss; + + local_irq_disable(); + /* + * Check any possible error state of events used by performing + * one local read. + */ + if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + + /* + * Disable hardware prefetchers. + */ + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + + /* Initialize rest of local variables */ + /* + * Performance event has been validated right before this with + * interrupts disabled - it is thus safe to read the counter index. + */ + miss_pmcnum = x86_perf_rdpmc_index(miss_event); + hit_pmcnum = x86_perf_rdpmc_index(hit_event); + line_size = READ_ONCE(plr->line_size); + mem_r = READ_ONCE(plr->kmem); + size = READ_ONCE(plr->size); + + /* + * Read counter variables twice - first to load the instructions + * used in L1 cache, second to capture accurate value that does not + * include cache misses incurred because of instruction loads. + */ + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * From SDM: Performing back-to-back fast reads are not guaranteed + * to be monotonic. + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_after); + rdpmcl(miss_pmcnum, miss_after); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + /* Re-enable hardware prefetchers */ + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); +out_hit: + perf_event_release_kernel(hit_event); +out_miss: + perf_event_release_kernel(miss_event); +out: + /* + * All counts will be zero on failure. + */ + counts->miss_before = miss_before; + counts->hits_before = hits_before; + counts->miss_after = miss_after; + counts->hits_after = hits_after; + return 0; +} + +static int measure_l2_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; /* * Non-architectural event for the Goldmont Microarchitecture * from Intel x86 Architecture Software Developer Manual (SDM): * MEM_LOAD_UOPS_RETIRED D1H (event number) * Umask values: - * L1_HIT 01H * L2_HIT 02H - * L1_MISS 08H * L2_MISS 10H - * - * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event - * has two "no fix" errata associated with it: BDM35 and BDM100. On - * this platform we use the following events instead: - * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/) - * REFERENCES FFH - * MISS 3FH - * LONGEST_LAT_CACHE 2EH (Documented in SDM) - * REFERENCE 4FH - * MISS 41H */ - - /* - * Start by setting flags for IA32_PERFEVTSELx: - * OS (Operating system mode) 0x2 - * INT (APIC interrupt enable) 0x10 - * EN (Enable counter) 0x40 - * - * Then add the Umask value and event number to select performance - * event. - */ - switch (boot_cpu_data.x86_model) { case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: - l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; - l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; - break; - case INTEL_FAM6_BROADWELL_X: - /* On BDW the l2_hit_bits count references, not hits */ - l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24; - l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24; - /* On BDW the l3_hit_bits count references, not hits */ - l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e; - l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e; + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + perf_miss_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x10); + perf_hit_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x2); break; default: goto out; } - local_irq_disable(); + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * Call wrmsr direcly to avoid the local register variables from - * being overwritten due to reordering of their assignment with - * the wrmsr calls. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - /* Disable events and reset counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0); - } - /* Set and enable the L2 counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits); - } - mem_r = plr->kmem; - size = plr->size; - line_size = plr->line_size; - for (i = 0; i < size; i += line_size) { - asm volatile("mov (%0,%1,1), %%eax\n\t" - : - : "r" (mem_r), "r" (i) - : "%eax", "memory"); - } + trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, + counts.miss_after - counts.miss_before); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_l3_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + /* - * Call wrmsr directly (no tracing) to not influence - * the cache access counters as they are disabled. + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform the following events are used instead: + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, - l2_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, - l2_miss_bits & ~(0x40ULL << 16)); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits & ~(0x40ULL << 16)); - } - l2_hits = native_read_pmc(0); - l2_miss = native_read_pmc(1); - if (l3_hit_bits > 0) { - l3_hits = native_read_pmc(2); - l3_miss = native_read_pmc(3); + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* On BDW the hit event counts references, not hits */ + perf_hit_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x4f); + perf_miss_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x41); + break; + default: + goto out; } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); - local_irq_enable(); + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * On BDW we count references and misses, need to adjust. Sometimes - * the "hits" counter is a bit more than the references, for - * example, x references but x + 1 hits. To not report invalid - * hit values in this case we treat that as misses eaqual to - * references. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss); - trace_pseudo_lock_l2(l2_hits, l2_miss); - if (l3_hit_bits > 0) { - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss); - trace_pseudo_lock_l3(l3_hits, l3_miss); + + counts.miss_after -= counts.miss_before; + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + /* + * On BDW references and misses are counted, need to adjust. + * Sometimes the "hits" counter is a bit more than the + * references, for example, x references but x + 1 hits. + * To not report invalid hit values in this case we treat + * that as misses equal to references. + */ + /* First compute the number of cache references measured */ + counts.hits_after -= counts.hits_before; + /* Next convert references to cache hits */ + counts.hits_after -= min(counts.miss_after, counts.hits_after); + } else { + counts.hits_after -= counts.hits_before; } + trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); out: plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -1114,6 +1174,11 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + if (!plr->d) { + ret = -ENODEV; + goto out; + } + plr->thread_done = 0; cpu = cpumask_first(&plr->d->cpu_mask); if (!cpu_online(cpu)) { @@ -1121,13 +1186,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + plr->cpu = cpu; + if (sel == 1) thread = kthread_create_on_node(measure_cycles_lat_fn, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); else if (sel == 2) - thread = kthread_create_on_node(measure_cycles_perf_fn, plr, + thread = kthread_create_on_node(measure_l2_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 3) + thread = kthread_create_on_node(measure_l3_residency, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); @@ -1171,7 +1243,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, buf[buf_size] = '\0'; ret = kstrtoint(buf, 10, &sel); if (ret == 0) { - if (sel != 1) + if (sel != 1 && sel != 2 && sel != 3) return -EINVAL; ret = debugfs_file_get(file->f_path.dentry); if (ret) @@ -1427,6 +1499,11 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) plr = rdtgrp->plr; + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + /* * Task is required to run with affinity to the cpus associated * with the pseudo-locked region. If this is not the case the task diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 1b8e86a5d5e1..f27b8115ffa2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -268,17 +268,27 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdtgroup *rdtgrp; + struct cpumask *mask; int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); - else + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + } } else { ret = -ENOENT; } @@ -961,7 +971,78 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, } /** - * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * rdt_cdp_peer_get - Retrieve CDP peer if it exists + * @r: RDT resource to which RDT domain @d belongs + * @d: Cache instance for which a CDP peer is requested + * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) + * Used to return the result. + * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) + * Used to return the result. + * + * RDT resources are managed independently and by extension the RDT domains + * (RDT resource instances) are managed independently also. The Code and + * Data Prioritization (CDP) RDT resources, while managed independently, + * could refer to the same underlying hardware. For example, + * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. + * + * When provided with an RDT resource @r and an instance of that RDT + * resource @d rdt_cdp_peer_get() will return if there is a peer RDT + * resource and the exact instance that shares the same hardware. + * + * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. + * If a CDP peer was found, @r_cdp will point to the peer RDT resource + * and @d_cdp will point to the peer RDT domain. + */ +static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, + struct rdt_resource **r_cdp, + struct rdt_domain **d_cdp) +{ + struct rdt_resource *_r_cdp = NULL; + struct rdt_domain *_d_cdp = NULL; + int ret = 0; + + switch (r->rid) { + case RDT_RESOURCE_L3DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + break; + case RDT_RESOURCE_L3CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + break; + case RDT_RESOURCE_L2DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; + break; + case RDT_RESOURCE_L2CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; + break; + default: + ret = -ENOENT; + goto out; + } + + /* + * When a new CPU comes online and CDP is enabled then the new + * RDT domains (if any) associated with both CDP RDT resources + * are added in the same CPU online routine while the + * rdtgroup_mutex is held. It should thus not happen for one + * RDT domain to exist and be associated with its RDT CDP + * resource but there is no RDT domain associated with the + * peer RDT CDP resource. Hence the WARN. + */ + _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); + if (WARN_ON(!_d_cdp)) { + _r_cdp = NULL; + ret = -EINVAL; + } + +out: + *r_cdp = _r_cdp; + *d_cdp = _d_cdp; + + return ret; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. @@ -975,33 +1056,34 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, * is false then overlaps with any resource group or hardware entities * will be considered. * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * * Return: false if CBM does not overlap, true if it does. */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - u32 _cbm, int closid, bool exclusive) +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) { - unsigned long *cbm = (unsigned long *)&_cbm; - unsigned long *ctrl_b; enum rdtgrp_mode mode; + unsigned long ctrl_b; u32 *ctrl; int i; /* Check for any overlap with regions used by hardware directly */ if (!exclusive) { - if (bitmap_intersects(cbm, - (unsigned long *)&r->cache.shareable_bits, - r->cache.cbm_len)) + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) return true; } /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; for (i = 0; i < closids_supported(); i++, ctrl++) { - ctrl_b = (unsigned long *)ctrl; + ctrl_b = *ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { if (exclusive) { if (mode == RDT_MODE_EXCLUSIVE) return true; @@ -1016,6 +1098,41 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, } /** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. + * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + struct rdt_resource *r_cdp; + struct rdt_domain *d_cdp; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + return true; + + if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + return false; + + return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); +} + +/** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive * * An exclusive resource group implies that there should be no sharing of @@ -1138,15 +1255,18 @@ out: * computed by first dividing the total cache size by the CBM length to * determine how many bytes each bit in the bitmask represents. The result * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. */ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_domain *d, u32 cbm) + struct rdt_domain *d, unsigned long cbm) { struct cpu_cacheinfo *ci; unsigned int size = 0; int num_b, i; - num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len); + num_b = bitmap_weight(&cbm, r->cache.cbm_len); ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); for (i = 0; i < ci->num_leaves; i++) { if (ci->info_list[i].level == r->cache_level) { @@ -1172,6 +1292,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; + int ret = 0; bool sep; u32 ctrl; @@ -1182,11 +1303,18 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + } goto out; } @@ -1216,7 +1344,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - return 0; + return ret; } /* rdtgroup information files for one cache resource. */ @@ -2350,13 +2478,16 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct rdt_resource *r_cdp = NULL; + struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; u32 closid = rdtgrp->closid; struct rdt_resource *r; + unsigned long tmp_cbm; enum rdtgrp_mode mode; struct rdt_domain *d; + u32 peer_ctl, *ctrl; int i, ret; - u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { /* @@ -2366,6 +2497,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA) continue; list_for_each_entry(d, &r->domains, list) { + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; @@ -2375,9 +2507,19 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) break; - used_b |= *ctrl; + /* + * If CDP is active include peer + * domain's usage to ensure there + * is no overlap with an exclusive + * group. + */ + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl; + d->new_ctrl |= *ctrl | peer_ctl; } } if (d->plr && d->plr->cbm > 0) @@ -2390,9 +2532,14 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) * modify the CBM based on system availability. */ cbm_ensure_valid(&d->new_ctrl, r); - if (bitmap_weight((unsigned long *) &d->new_ctrl, - r->cache.cbm_len) < - r->cache.min_cbm_bits) { + /* + * Assign the u32 CBM to an unsigned long to ensure + * that bitmap_weight() does not access out-of-bound + * memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < + r->cache.min_cbm_bits) { rdt_last_cmd_printf("no space on %s:%d\n", r->name, d->id); return -ENOSPC; @@ -2795,6 +2942,13 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); + + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + seq_puts(seq, ",mba_MBps"); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 97685a0c3175..27f394ac983f 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -38,9 +38,6 @@ static struct mce_log_buffer mcelog = { static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); -/* User mode helper program triggered by machine check event */ -extern char mce_helper[128]; - static int dev_mce_log(struct notifier_block *nb, unsigned long val, void *data) { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index c805a06e14c3..1fc424c40a31 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -108,6 +108,9 @@ static void setup_inj_struct(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; } /* Update fake mce registers on current CPU. */ @@ -576,6 +579,9 @@ static int inj_bank_set(void *data, u64 val) m->bank = val; do_inject(); + /* Reset injection struct */ + setup_inj_struct(&i_mce); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 953b3ce92dcc..ef8fd1f2ede0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1315,7 +1315,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) local_irq_disable(); ist_end_non_atomic(); } else { - if (!fixup_exception(regs, X86_TRAP_MC)) + if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, NULL); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 23f1691670b6..61a949d84dfa 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -314,7 +314,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Validate and sanitize the copied state. */ - struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index eeea935e9bb5..aac0c1f7e354 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,55 +42,40 @@ static void __ref __jump_label_transform(struct jump_entry *entry, void *(*poker)(void *, const void *, size_t), int init) { - union jump_code_union code; + union jump_code_union jmp; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + const void *expect, *code; + int line; + + jmp.jump = 0xe9; + jmp.offset = jump_entry_target(entry) - + (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); if (early_boot_irqs_disabled) poker = text_poke_early; if (type == JUMP_LABEL_JMP) { if (init) { - /* - * Jump label is enabled for the first time. - * So we expect a default_nop... - */ - if (unlikely(memcmp((void *)entry->code, default_nop, 5) - != 0)) - bug_at((void *)entry->code, __LINE__); + expect = default_nop; line = __LINE__; } else { - /* - * ...otherwise expect an ideal_nop. Otherwise - * something went horribly wrong. - */ - if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) - != 0)) - bug_at((void *)entry->code, __LINE__); + expect = ideal_nop; line = __LINE__; } - code.jump = 0xe9; - code.offset = entry->target - - (entry->code + JUMP_LABEL_NOP_SIZE); + code = &jmp.code; } else { - /* - * We are disabling this jump label. If it is not what - * we think it is, then something must have gone wrong. - * If this is the first initialization call, then we - * are converting the default nop to the ideal nop. - */ if (init) { - if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) - bug_at((void *)entry->code, __LINE__); + expect = default_nop; line = __LINE__; } else { - code.jump = 0xe9; - code.offset = entry->target - - (entry->code + JUMP_LABEL_NOP_SIZE); - if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) - bug_at((void *)entry->code, __LINE__); + expect = &jmp.code; line = __LINE__; } - memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + + code = ideal_nop; } + if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) + bug_at((void *)jump_entry_code(entry), line); + /* * Make text_poke_bp() a default fallback poker. * @@ -99,11 +84,14 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction * */ - if (poker) - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); - else - text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, - (void *)entry->code + JUMP_LABEL_NOP_SIZE); + if (poker) { + (*poker)((void *)jump_entry_code(entry), code, + JUMP_LABEL_NOP_SIZE); + return; + } + + text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE, + (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b0d1e81c96bb..f72a47b602e2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1020,50 +1020,12 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) */ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs, trapnr)) - return 1; - - /* - * fixup routine could not handle it, - * Let do_page_fault() fix it. - */ } return 0; } NOKPROBE_SYMBOL(kprobe_fault_handler); -/* - * Wrapper routine for handling exceptions. - */ -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, - void *data) -{ - struct die_args *args = data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - if (val == DIE_GPF) { - /* - * To be potentially processing a kprobe fault and to - * trust the result from kprobe_running(), we have - * be non-preemptible. - */ - if (!preemptible() && kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - } - return ret; -} -NOKPROBE_SYMBOL(kprobe_exceptions_notify); - bool arch_within_kprobe_blacklist(unsigned long addr) { bool is_in_entry_trampoline_section = false; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index eaf02f2e7300..40b16b270656 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -179,7 +179,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + preempt_enable(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S new file mode 100644 index 000000000000..161c95059044 --- /dev/null +++ b/arch/x86/kernel/macros.S @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This file includes headers whose assembly part includes macros which are + * commonly used. The macros are precompiled into assmebly file which is later + * assembled together with each compiled file. + */ + +#include <linux/compiler.h> +#include <asm/refcount.h> +#include <asm/alternative-asm.h> +#include <asm/bug.h> +#include <asm/paravirt.h> +#include <asm/asm.h> +#include <asm/cpufeature.h> +#include <asm/jump_label.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f58336af095c..b052e883dd8c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -201,6 +201,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, goto overflow; #endif break; + case R_X86_64_PC64: + if (*(u64 *)loc != 0) + goto invalid_relocation; + val -= (u64)loc; + *(u64 *)loc = val; + break; default: pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 661583662430..71c0b01d93b1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,10 +42,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, int __init pci_swiotlb_detect_4gb(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ -#ifdef CONFIG_X86_64 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; -#endif /* * If SME is active then swiotlb will be set to 1 so that bounce diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4866badb235..90ecc108bc8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1251,7 +1251,7 @@ void __init setup_arch(char **cmdline_p) x86_init.hyper.guest_late_init(); e820__reserve_resources(); - e820__register_nosave_regions(max_low_pfn); + e820__register_nosave_regions(max_pfn); x86_init.resources.reserve_resources(); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be01328eb755..fddaefc51fb6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -25,7 +25,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e6db475164ed..16c95cb90496 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -206,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; @@ -551,11 +551,21 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs, X86_TRAP_GP)) + if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; + + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have to + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, X86_TRAP_GP)) + return; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); @@ -838,7 +848,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, error_code, 0)) return; task->thread.error_code = error_code; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b52bd2b6cdb4..03b7529333a6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -58,7 +58,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void cyc2ns_read_begin(struct cyc2ns_data *data) +void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -75,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void cyc2ns_read_end(void) +void __always_inline cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -104,7 +104,7 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; @@ -636,7 +636,7 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 27ef714d886c..3d0e9aeea7c8 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -59,12 +59,12 @@ static const struct freq_desc freq_desc_ann = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), {} }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d96092b35936..61ccfb13899e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -436,14 +436,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) static inline bool svm_sev_enabled(void) { - return max_sev_asid; + return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0; } static inline bool sev_guest(struct kvm *kvm) { +#ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; return sev->active; +#else + return false; +#endif } static inline int sev_get_asid(struct kvm *kvm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 612fd17be635..e665aa7167cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1572,8 +1572,12 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) goto out; } + /* + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the + * base of EPT PML4 table, strip off EPT configuration information. + */ ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); out: spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 46e71a74e612..ad8e0906d1ea 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -273,11 +273,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define SRC(y...) \ 9999: y; \ - _ASM_EXTABLE(9999b, 6001f) + _ASM_EXTABLE_UA(9999b, 6001f) #define DST(y...) \ 9999: y; \ - _ASM_EXTABLE(9999b, 6002f) + _ASM_EXTABLE_UA(9999b, 6002f) #ifndef CONFIG_X86_USE_PPRO_CHECKSUM diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 020f75cc8cf6..db4e5aa0858b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -92,26 +92,26 @@ ENTRY(copy_user_generic_unrolled) 60: jmp copy_user_handle_tail /* ecx is zerorest also */ .previous - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) + _ASM_EXTABLE_UA(1b, 30b) + _ASM_EXTABLE_UA(2b, 30b) + _ASM_EXTABLE_UA(3b, 30b) + _ASM_EXTABLE_UA(4b, 30b) + _ASM_EXTABLE_UA(5b, 30b) + _ASM_EXTABLE_UA(6b, 30b) + _ASM_EXTABLE_UA(7b, 30b) + _ASM_EXTABLE_UA(8b, 30b) + _ASM_EXTABLE_UA(9b, 30b) + _ASM_EXTABLE_UA(10b, 30b) + _ASM_EXTABLE_UA(11b, 30b) + _ASM_EXTABLE_UA(12b, 30b) + _ASM_EXTABLE_UA(13b, 30b) + _ASM_EXTABLE_UA(14b, 30b) + _ASM_EXTABLE_UA(15b, 30b) + _ASM_EXTABLE_UA(16b, 30b) + _ASM_EXTABLE_UA(18b, 40b) + _ASM_EXTABLE_UA(19b, 40b) + _ASM_EXTABLE_UA(21b, 50b) + _ASM_EXTABLE_UA(22b, 50b) ENDPROC(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) @@ -156,8 +156,8 @@ ENTRY(copy_user_generic_string) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,11b) - _ASM_EXTABLE(3b,12b) + _ASM_EXTABLE_UA(1b, 11b) + _ASM_EXTABLE_UA(3b, 12b) ENDPROC(copy_user_generic_string) EXPORT_SYMBOL(copy_user_generic_string) @@ -189,7 +189,7 @@ ENTRY(copy_user_enhanced_fast_string) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,12b) + _ASM_EXTABLE_UA(1b, 12b) ENDPROC(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) @@ -319,27 +319,27 @@ ENTRY(__copy_user_nocache) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(2b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(3b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(4b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(5b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(6b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(7b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(8b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(9b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(10b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(11b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(12b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(13b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(14b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(15b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(16b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(20b,.L_fixup_8b_copy) - _ASM_EXTABLE(21b,.L_fixup_8b_copy) - _ASM_EXTABLE(30b,.L_fixup_4b_copy) - _ASM_EXTABLE(31b,.L_fixup_4b_copy) - _ASM_EXTABLE(40b,.L_fixup_1b_copy) - _ASM_EXTABLE(41b,.L_fixup_1b_copy) + _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy) + _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy) + _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy) + _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy) + _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy) + _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy) ENDPROC(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 45a53dfe1859..a4a379e79259 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -31,14 +31,18 @@ .macro source 10: - _ASM_EXTABLE(10b, .Lbad_source) + _ASM_EXTABLE_UA(10b, .Lbad_source) .endm .macro dest 20: - _ASM_EXTABLE(20b, .Lbad_dest) + _ASM_EXTABLE_UA(20b, .Lbad_dest) .endm + /* + * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a + * potentially unmapped kernel address. + */ .macro ignore L=.Lignore 30: _ASM_EXTABLE(30b, \L) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 49b167f73215..74fdff968ea3 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -132,12 +132,12 @@ bad_get_user_8: END(bad_get_user_8) #endif - _ASM_EXTABLE(1b,bad_get_user) - _ASM_EXTABLE(2b,bad_get_user) - _ASM_EXTABLE(3b,bad_get_user) + _ASM_EXTABLE_UA(1b, bad_get_user) + _ASM_EXTABLE_UA(2b, bad_get_user) + _ASM_EXTABLE_UA(3b, bad_get_user) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b,bad_get_user) + _ASM_EXTABLE_UA(4b, bad_get_user) #else - _ASM_EXTABLE(4b,bad_get_user_8) - _ASM_EXTABLE(5b,bad_get_user_8) + _ASM_EXTABLE_UA(4b, bad_get_user_8) + _ASM_EXTABLE_UA(5b, bad_get_user_8) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 96dce5fe2a35..d2e5c9c39601 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -94,10 +94,10 @@ bad_put_user: EXIT END(bad_put_user) - _ASM_EXTABLE(1b,bad_put_user) - _ASM_EXTABLE(2b,bad_put_user) - _ASM_EXTABLE(3b,bad_put_user) - _ASM_EXTABLE(4b,bad_put_user) + _ASM_EXTABLE_UA(1b, bad_put_user) + _ASM_EXTABLE_UA(2b, bad_put_user) + _ASM_EXTABLE_UA(3b, bad_put_user) + _ASM_EXTABLE_UA(4b, bad_put_user) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(5b,bad_put_user) + _ASM_EXTABLE_UA(5b, bad_put_user) #endif diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7add8ba06887..71fb58d44d58 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -47,8 +47,8 @@ do { \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,2b) \ + _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ } while (0) @@ -153,44 +153,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) "101: lea 0(%%eax,%0,4),%0\n" " jmp 100b\n" ".previous\n" - _ASM_EXTABLE(1b,100b) - _ASM_EXTABLE(2b,100b) - _ASM_EXTABLE(3b,100b) - _ASM_EXTABLE(4b,100b) - _ASM_EXTABLE(5b,100b) - _ASM_EXTABLE(6b,100b) - _ASM_EXTABLE(7b,100b) - _ASM_EXTABLE(8b,100b) - _ASM_EXTABLE(9b,100b) - _ASM_EXTABLE(10b,100b) - _ASM_EXTABLE(11b,100b) - _ASM_EXTABLE(12b,100b) - _ASM_EXTABLE(13b,100b) - _ASM_EXTABLE(14b,100b) - _ASM_EXTABLE(15b,100b) - _ASM_EXTABLE(16b,100b) - _ASM_EXTABLE(17b,100b) - _ASM_EXTABLE(18b,100b) - _ASM_EXTABLE(19b,100b) - _ASM_EXTABLE(20b,100b) - _ASM_EXTABLE(21b,100b) - _ASM_EXTABLE(22b,100b) - _ASM_EXTABLE(23b,100b) - _ASM_EXTABLE(24b,100b) - _ASM_EXTABLE(25b,100b) - _ASM_EXTABLE(26b,100b) - _ASM_EXTABLE(27b,100b) - _ASM_EXTABLE(28b,100b) - _ASM_EXTABLE(29b,100b) - _ASM_EXTABLE(30b,100b) - _ASM_EXTABLE(31b,100b) - _ASM_EXTABLE(32b,100b) - _ASM_EXTABLE(33b,100b) - _ASM_EXTABLE(34b,100b) - _ASM_EXTABLE(35b,100b) - _ASM_EXTABLE(36b,100b) - _ASM_EXTABLE(37b,100b) - _ASM_EXTABLE(99b,101b) + _ASM_EXTABLE_UA(1b, 100b) + _ASM_EXTABLE_UA(2b, 100b) + _ASM_EXTABLE_UA(3b, 100b) + _ASM_EXTABLE_UA(4b, 100b) + _ASM_EXTABLE_UA(5b, 100b) + _ASM_EXTABLE_UA(6b, 100b) + _ASM_EXTABLE_UA(7b, 100b) + _ASM_EXTABLE_UA(8b, 100b) + _ASM_EXTABLE_UA(9b, 100b) + _ASM_EXTABLE_UA(10b, 100b) + _ASM_EXTABLE_UA(11b, 100b) + _ASM_EXTABLE_UA(12b, 100b) + _ASM_EXTABLE_UA(13b, 100b) + _ASM_EXTABLE_UA(14b, 100b) + _ASM_EXTABLE_UA(15b, 100b) + _ASM_EXTABLE_UA(16b, 100b) + _ASM_EXTABLE_UA(17b, 100b) + _ASM_EXTABLE_UA(18b, 100b) + _ASM_EXTABLE_UA(19b, 100b) + _ASM_EXTABLE_UA(20b, 100b) + _ASM_EXTABLE_UA(21b, 100b) + _ASM_EXTABLE_UA(22b, 100b) + _ASM_EXTABLE_UA(23b, 100b) + _ASM_EXTABLE_UA(24b, 100b) + _ASM_EXTABLE_UA(25b, 100b) + _ASM_EXTABLE_UA(26b, 100b) + _ASM_EXTABLE_UA(27b, 100b) + _ASM_EXTABLE_UA(28b, 100b) + _ASM_EXTABLE_UA(29b, 100b) + _ASM_EXTABLE_UA(30b, 100b) + _ASM_EXTABLE_UA(31b, 100b) + _ASM_EXTABLE_UA(32b, 100b) + _ASM_EXTABLE_UA(33b, 100b) + _ASM_EXTABLE_UA(34b, 100b) + _ASM_EXTABLE_UA(35b, 100b) + _ASM_EXTABLE_UA(36b, 100b) + _ASM_EXTABLE_UA(37b, 100b) + _ASM_EXTABLE_UA(99b, 101b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -259,26 +259,26 @@ static unsigned long __copy_user_intel_nocache(void *to, "9: lea 0(%%eax,%0,4),%0\n" "16: jmp 8b\n" ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) + _ASM_EXTABLE_UA(0b, 16b) + _ASM_EXTABLE_UA(1b, 16b) + _ASM_EXTABLE_UA(2b, 16b) + _ASM_EXTABLE_UA(21b, 16b) + _ASM_EXTABLE_UA(3b, 16b) + _ASM_EXTABLE_UA(31b, 16b) + _ASM_EXTABLE_UA(4b, 16b) + _ASM_EXTABLE_UA(41b, 16b) + _ASM_EXTABLE_UA(10b, 16b) + _ASM_EXTABLE_UA(51b, 16b) + _ASM_EXTABLE_UA(11b, 16b) + _ASM_EXTABLE_UA(61b, 16b) + _ASM_EXTABLE_UA(12b, 16b) + _ASM_EXTABLE_UA(71b, 16b) + _ASM_EXTABLE_UA(13b, 16b) + _ASM_EXTABLE_UA(81b, 16b) + _ASM_EXTABLE_UA(14b, 16b) + _ASM_EXTABLE_UA(91b, 16b) + _ASM_EXTABLE_UA(6b, 9b) + _ASM_EXTABLE_UA(7b, 16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -321,9 +321,9 @@ do { \ "3: lea 0(%3,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(4b,5b) \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,2b) \ + _ASM_EXTABLE_UA(4b, 5b) \ + _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index c50a1d815a37..1bd837cdc4b1 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -37,8 +37,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(0b,3b) - _ASM_EXTABLE(1b,2b) + _ASM_EXTABLE_UA(0b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); clac(); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 45f5d6cf65ae..6521134057e8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -8,7 +8,8 @@ #include <asm/kdebug.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int); + struct pt_regs *, int, unsigned long, + unsigned long); static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) @@ -22,7 +23,9 @@ ex_fixup_handler(const struct exception_table_entry *x) } __visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); return true; @@ -30,7 +33,9 @@ __visible bool ex_handler_default(const struct exception_table_entry *fixup, EXPORT_SYMBOL(ex_handler_default); __visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; @@ -43,7 +48,9 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * result of a refcount inc/dec/add/sub. */ __visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { /* First unconditionally saturate the refcount. */ *(int *)regs->cx = INT_MIN / 2; @@ -96,7 +103,9 @@ EXPORT_SYMBOL(ex_handler_refcount); * out all the FPU registers) if we can't restore from the task's FPU state. */ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); @@ -108,9 +117,79 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fprestore); +/* Helper to check whether a uaccess fault indicates a kernel bug. */ +static bool bogus_uaccess(struct pt_regs *regs, int trapnr, + unsigned long fault_addr) +{ + /* This is the normal case: #PF with a fault address in userspace. */ + if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX) + return false; + + /* + * This code can be reached for machine checks, but only if the #MC + * handler has already decided that it looks like a candidate for fixup. + * This e.g. happens when attempting to access userspace memory which + * the CPU can't access because of uncorrectable bad memory. + */ + if (trapnr == X86_TRAP_MC) + return false; + + /* + * There are two remaining exception types we might encounter here: + * - #PF for faulting accesses to kernel addresses + * - #GP for faulting accesses to noncanonical addresses + * Complain about anything else. + */ + if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) { + WARN(1, "unexpected trap %d in uaccess\n", trapnr); + return false; + } + + /* + * This is a faulting memory access in kernel space, on a kernel + * address, in a usercopy function. This can e.g. be caused by improper + * use of helpers like __put_user and by improper attempts to access + * userspace addresses in KERNEL_DS regions. + * The one (semi-)legitimate exception are probe_kernel_{read,write}(), + * which can be invoked from places like kgdb, /dev/mem (for reading) + * and privileged BPF code (for reading). + * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag + * to tell us that faulting on kernel addresses, and even noncanonical + * addresses, in a userspace accessor does not necessarily imply a + * kernel bug, root might just be doing weird stuff. + */ + if (current->kernel_uaccess_faults_ok) + return false; + + /* This is bad. Refuse the fixup so that we go into die(). */ + if (trapnr == X86_TRAP_PF) { + pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n", + fault_addr); + } else { + pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n"); + } + return true; +} + +__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + if (bogus_uaccess(regs, trapnr, fault_addr)) + return false; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_uaccess); + __visible bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { + if (bogus_uaccess(regs, trapnr, fault_addr)) + return false; /* Special hack for uaccess_err */ current->thread.uaccess_err = 1; regs->ip = ex_fixup_addr(fixup); @@ -119,7 +198,9 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup, EXPORT_SYMBOL(ex_handler_ext); __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) @@ -134,7 +215,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -148,12 +231,14 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr); + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_clear_fs); @@ -170,7 +255,8 @@ __visible bool ex_has_fault_handler(unsigned long ip) return handler == ex_handler_fault; } -int fixup_exception(struct pt_regs *regs, int trapnr) +int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, + unsigned long fault_addr) { const struct exception_table_entry *e; ex_handler_t handler; @@ -194,7 +280,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return 0; handler = ex_fixup_handler(e); - return handler(e, regs, trapnr); + return handler(e, regs, trapnr, error_code, fault_addr); } extern unsigned int early_recursion_flag; @@ -230,9 +316,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early - * fage faults, for example, are special. + * page faults, for example, are special. */ - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; if (fixup_bug(regs, trapnr)) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 47bebfe6efa7..0d45f6debb3a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/efi.h> /* efi_recover_from_page_fault()*/ #include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ @@ -25,6 +26,7 @@ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/efi.h> /* efi_recover_from_page_fault()*/ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -44,17 +46,19 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) static nokprobe_inline int kprobes_fault(struct pt_regs *regs) { - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; + if (!kprobes_built_in()) + return 0; + if (user_mode(regs)) + return 0; + /* + * To be potentially processing a kprobe fault and to be allowed to call + * kprobe_running(), we have to be non-preemptible. + */ + if (preemptible()) + return 0; + if (!kprobe_running()) + return 0; + return kprobe_fault_handler(regs, X86_TRAP_PF); } /* @@ -709,7 +713,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF)) { + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from @@ -789,6 +793,13 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; /* + * Buggy firmware could access regions which might page fault, try to + * recover from such faults. + */ + if (IS_ENABLED(CONFIG_EFI)) + efi_recover_from_page_fault(address); + + /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 089e78c4effd..59274e2c1ac4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -115,6 +115,8 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) +#define MAX_UNSHARED_PTRS_PER_PGD \ + max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) @@ -181,6 +183,7 @@ static void pgd_dtor(pgd_t *pgd) * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table @@ -189,6 +192,7 @@ static void pgd_dtor(pgd_t *pgd) */ #define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) +#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { @@ -210,7 +214,9 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 +#define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 +#define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) @@ -428,8 +434,8 @@ static inline void _pgd_free(pgd_t *pgd) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; - pmd_t *pmds[PREALLOCATED_PMDS]; + pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; + pmd_t *pmds[MAX_PREALLOCATED_PMDS]; pgd = _pgd_alloc(); diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 034813d4ab1e..6cb6076223ba 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -115,7 +115,7 @@ static struct dentry *punit_dbg_file; static int punit_dbgfs_register(struct punit_device *punit_device) { - static struct dentry *dev_state; + struct dentry *dev_state; punit_dbg_file = debugfs_create_dir("punit_atom", NULL); if (!punit_dbg_file) @@ -143,8 +143,8 @@ static void punit_dbgfs_unregister(void) (kernel_ulong_t)&drv_data } static const struct x86_cpu_id intel_punit_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), {} }; diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 5fdacb322ceb..7476b3b097e1 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -26,12 +26,14 @@ static bool early_efi_keep; */ static __init int early_efi_map_fb(void) { - unsigned long base, size; + u64 base, size; if (!early_efi_keep) return 0; base = boot_params.screen_info.lfb_base; + if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)boot_params.screen_info.ext_lfb_base << 32; size = boot_params.screen_info.lfb_size; efi_fb = ioremap(base, size); @@ -46,9 +48,11 @@ early_initcall(early_efi_map_fb); */ static __ref void *early_efi_map(unsigned long start, unsigned long len) { - unsigned long base; + u64 base; base = boot_params.screen_info.lfb_base; + if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)boot_params.screen_info.ext_lfb_base << 32; if (efi_fb) return (efi_fb + start); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ee5d08f25ce4..e8da7f492970 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -619,18 +619,16 @@ void __init efi_dump_pagetable(void) /* * Makes the calling thread switch to/from efi_mm context. Can be used - * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well - * as during efi runtime calls i.e current->active_mm == current_mm. - * We are not mm_dropping()/mm_grabbing() any mm, because we are not - * losing/creating any references. + * in a kernel thread and user context. Preemption needs to remain disabled + * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm + * can not change under us. + * It should be ensured that there are no concurent calls to this function. */ void efi_switch_mm(struct mm_struct *mm) { - task_lock(current); efi_scratch.prev_mm = current->active_mm; current->active_mm = mm; switch_mm(efi_scratch.prev_mm, mm, NULL); - task_unlock(current); } #ifdef CONFIG_EFI_MIXED diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 844d31cb8a0c..669babcaf245 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -16,6 +16,7 @@ #include <asm/efi.h> #include <asm/uv/uv.h> #include <asm/cpu_device_id.h> +#include <asm/reboot.h> #define EFI_MIN_RESERVE 5120 @@ -654,3 +655,80 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, } #endif + +/* + * If any access by any efi runtime service causes a page fault, then, + * 1. If it's efi_reset_system(), reboot through BIOS. + * 2. If any other efi runtime service, then + * a. Return error status to the efi caller process. + * b. Disable EFI Runtime Services forever and + * c. Freeze efi_rts_wq and schedule new process. + * + * @return: Returns, if the page fault is not handled. This function + * will never return if the page fault is handled successfully. + */ +void efi_recover_from_page_fault(unsigned long phys_addr) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* + * Make sure that an efi runtime service caused the page fault. + * "efi_mm" cannot be used to check if the page fault had occurred + * in the firmware context because efi=old_map doesn't use efi_pgd. + */ + if (efi_rts_work.efi_rts_id == NONE) + return; + + /* + * Address range 0x0000 - 0x0fff is always mapped in the efi_pgd, so + * page faulting on these addresses isn't expected. + */ + if (phys_addr >= 0x0000 && phys_addr <= 0x0fff) + return; + + /* + * Print stack trace as it might be useful to know which EFI Runtime + * Service is buggy. + */ + WARN(1, FW_BUG "Page fault caused by firmware at PA: 0x%lx\n", + phys_addr); + + /* + * Buggy efi_reset_system() is handled differently from other EFI + * Runtime Services as it doesn't use efi_rts_wq. Although, + * native_machine_emergency_restart() says that machine_real_restart() + * could fail, it's better not to compilcate this fault handler + * because this case occurs *very* rarely and hence could be improved + * on a need by basis. + */ + if (efi_rts_work.efi_rts_id == RESET_SYSTEM) { + pr_info("efi_reset_system() buggy! Reboot through BIOS\n"); + machine_real_restart(MRR_BIOS); + return; + } + + /* + * Before calling EFI Runtime Service, the kernel has switched the + * calling process to efi_mm. Hence, switch back to task_mm. + */ + arch_efi_call_virt_teardown(); + + /* Signal error status to the efi caller process */ + efi_rts_work.status = EFI_ABORTED; + complete(&efi_rts_work.efi_rts_comp); + + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + pr_info("Froze efi_rts_wq and disabled EFI Runtime Services\n"); + + /* + * Call schedule() in an infinite loop, so that any spurious wake ups + * will never run efi_rts_wq again. + */ + for (;;) { + set_current_state(TASK_IDLE); + schedule(); + } + + return; +} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c index 4392c15ed9e0..dbfc5cf2aa93 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c @@ -10,7 +10,7 @@ * of the License. */ -#include <linux/gpio.h> +#include <linux/gpio/machine.h> #include <linux/platform_device.h> #include <linux/regulator/machine.h> #include <linux/regulator/fixed.h> @@ -43,7 +43,6 @@ static struct fixed_voltage_config bcm43xx_vmmc = { * real voltage and signaling are still 1.8V. */ .microvolts = 2000000, /* 1.8V */ - .gpio = -EINVAL, .startup_delay = 250 * 1000, /* 250ms */ .enable_high = 1, /* active high */ .enabled_at_boot = 0, /* disabled at boot */ @@ -58,11 +57,23 @@ static struct platform_device bcm43xx_vmmc_regulator = { }, }; +static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = { + .dev_id = "reg-fixed-voltage.0", + .table = { + GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW), + {} + }, +}; + static int __init bcm43xx_regulator_register(void) { + struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table; + struct gpiod_lookup *lookup = table->table; int ret; - bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); + lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); + gpiod_add_lookup_table(table); + ret = platform_device_register(&bcm43xx_vmmc_regulator); if (ret) { pr_err("%s: vmmc regulator register failed\n", __func__); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..31dce781364c 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -68,7 +68,7 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = { { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } static const struct x86_cpu_id bt_sfi_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data), {} }; diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c index fd39301f25ac..7e56fc74093c 100644 --- a/arch/x86/platform/ts5500/ts5500.c +++ b/arch/x86/platform/ts5500/ts5500.c @@ -24,7 +24,6 @@ #include <linux/kernel.h> #include <linux/leds.h> #include <linux/init.h> -#include <linux/platform_data/gpio-ts5500.h> #include <linux/platform_data/max197.h> #include <linux/platform_device.h> #include <linux/slab.h> diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index a4701389562c..37923d715741 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -7,4 +7,4 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o -obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c new file mode 100644 index 000000000000..bcddf09b5aa3 --- /dev/null +++ b/arch/x86/power/hibernate.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hibernation support for x86 + * + * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> + +#include <asm/e820/api.h> +#include <asm/init.h> +#include <asm/proto.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mtrr.h> +#include <asm/sections.h> +#include <asm/suspend.h> +#include <asm/tlbflush.h> + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address __visible; +unsigned long jump_address_phys; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3 __visible; +unsigned long temp_pgt __visible; +unsigned long relocated_restore_code __visible; + +/** + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn; + unsigned long nosave_end_pfn; + + nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + + return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; +} + + +#define MD5_DIGEST_SIZE 16 + +struct restore_data_record { + unsigned long jump_address; + unsigned long jump_address_phys; + unsigned long cr3; + unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; +}; + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 table + * + * @table: the e820 table to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820_table *table, void *buf) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int size; + int ret = 0; + + tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(tfm)) + return -ENOMEM; + + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto free_tfm; + } + + desc->tfm = tfm; + desc->flags = 0; + + size = offsetof(struct e820_table, entries) + + sizeof(struct e820_entry) * table->nr_entries; + + if (crypto_shash_digest(desc, (u8 *)table, size, buf)) + ret = -EINVAL; + + kzfree(desc); + +free_tfm: + crypto_free_shash(tfm); + return ret; +} + +static int hibernation_e820_save(void *buf) +{ + return get_e820_md5(e820_table_firmware, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_table_firmware, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static int hibernation_e820_save(void *buf) +{ + return 0; +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif + +#ifdef CONFIG_X86_64 +#define RESTORE_MAGIC 0x23456789ABCDEF01UL +#else +#define RESTORE_MAGIC 0x12345678UL +#endif + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->magic = RESTORE_MAGIC; + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); + + /* + * The restore code fixes up CR3 and CR4 in the following sequence: + * + * [in hibernation asm] + * 1. CR3 <= temporary page tables + * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) + * 3. CR3 <= rdr->cr3 + * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) + * [in restore_processor_state()] + * 5. CR4 <= saved CR4 + * 6. CR3 <= saved CR3 + * + * Our mmu_cr4_features has CR4.PCIDE=0, and toggling + * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so + * rdr->cr3 needs to point to valid page tables but must not + * have any of the PCID bits set. + */ + rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; + + return hibernation_e820_save(rdr->e820_digest); +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + restore_jump_address = rdr->jump_address; + jump_address_phys = rdr->jump_address_phys; + restore_cr3 = rdr->cr3; + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; +} + +int relocate_restore_code(void) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + relocated_restore_code = get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + + /* Make the page containing the relocated code executable */ + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); + if (pud_large(*pud)) { + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; + } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: + __flush_tlb_all(); + return 0; +} diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index afc4ed7b1578..15695e30f982 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -14,9 +14,7 @@ #include <asm/pgtable.h> #include <asm/mmzone.h> #include <asm/sections.h> - -/* Defined in hibernate_asm_32.S */ -extern int restore_image(void); +#include <asm/suspend.h> /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -145,6 +143,32 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir) #endif } +static int set_up_temporary_text_mapping(pgd_t *pgd_base) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(restore_jump_address); + + pmd = resume_one_md_table_init(pgd); + if (!pmd) + return -ENOMEM; + + if (boot_cpu_has(X86_FEATURE_PSE)) { + set_pmd(pmd + pmd_index(restore_jump_address), + __pmd((jump_address_phys & PMD_MASK) | pgprot_val(PAGE_KERNEL_LARGE_EXEC))); + } else { + pte = resume_one_page_table_init(pmd); + if (!pte) + return -ENOMEM; + set_pte(pte + pte_index(restore_jump_address), + __pte((jump_address_phys & PAGE_MASK) | pgprot_val(PAGE_KERNEL_EXEC))); + } + + return 0; +} + asmlinkage int swsusp_arch_resume(void) { int error; @@ -154,22 +178,22 @@ asmlinkage int swsusp_arch_resume(void) return -ENOMEM; resume_init_first_level_page_table(resume_pg_dir); + + error = set_up_temporary_text_mapping(resume_pg_dir); + if (error) + return error; + error = resume_physical_mapping_init(resume_pg_dir); if (error) return error; + temp_pgt = __pa(resume_pg_dir); + + error = relocate_restore_code(); + if (error) + return error; + /* We have got enough memory and from now on we cannot recover */ restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f8e3b668d20b..239f424ccb29 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -26,26 +26,6 @@ #include <asm/suspend.h> #include <asm/tlbflush.h> -/* Defined in hibernate_asm_64.S */ -extern asmlinkage __visible int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address __visible; -unsigned long jump_address_phys; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3 __visible; - -unsigned long temp_level4_pgt __visible; - -unsigned long relocated_restore_code __visible; - static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; @@ -141,46 +121,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = __pa(pgd); - return 0; -} - -static int relocate_restore_code(void) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - relocated_restore_code = get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); - - /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3_pa()) + - pgd_index(relocated_restore_code); - p4d = p4d_offset(pgd, relocated_restore_code); - if (p4d_large(*p4d)) { - set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); - goto out; - } - pud = pud_offset(p4d, relocated_restore_code); - if (pud_large(*pud)) { - set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - goto out; - } - pmd = pmd_offset(pud, relocated_restore_code); - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - goto out; - } - pte = pte_offset_kernel(pmd, relocated_restore_code); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); -out: - __flush_tlb_all(); + temp_pgt = __pa(pgd); return 0; } @@ -200,166 +141,3 @@ asmlinkage int swsusp_arch_resume(void) restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -#define MD5_DIGEST_SIZE 16 - -struct restore_data_record { - unsigned long jump_address; - unsigned long jump_address_phys; - unsigned long cr3; - unsigned long magic; - u8 e820_digest[MD5_DIGEST_SIZE]; -}; - -#define RESTORE_MAGIC 0x23456789ABCDEF01UL - -#if IS_BUILTIN(CONFIG_CRYPTO_MD5) -/** - * get_e820_md5 - calculate md5 according to given e820 table - * - * @table: the e820 table to be calculated - * @buf: the md5 result to be stored to - */ -static int get_e820_md5(struct e820_table *table, void *buf) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int size; - int ret = 0; - - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto free_tfm; - } - - desc->tfm = tfm; - desc->flags = 0; - - size = offsetof(struct e820_table, entries) + - sizeof(struct e820_entry) * table->nr_entries; - - if (crypto_shash_digest(desc, (u8 *)table, size, buf)) - ret = -EINVAL; - - kzfree(desc); - -free_tfm: - crypto_free_shash(tfm); - return ret; -} - -static void hibernation_e820_save(void *buf) -{ - get_e820_md5(e820_table_firmware, buf); -} - -static bool hibernation_e820_mismatch(void *buf) -{ - int ret; - u8 result[MD5_DIGEST_SIZE]; - - memset(result, 0, MD5_DIGEST_SIZE); - /* If there is no digest in suspend kernel, let it go. */ - if (!memcmp(result, buf, MD5_DIGEST_SIZE)) - return false; - - ret = get_e820_md5(e820_table_firmware, result); - if (ret) - return true; - - return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; -} -#else -static void hibernation_e820_save(void *buf) -{ -} - -static bool hibernation_e820_mismatch(void *buf) -{ - /* If md5 is not builtin for restore kernel, let it go. */ - return false; -} -#endif - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = (unsigned long)restore_registers; - rdr->jump_address_phys = __pa_symbol(restore_registers); - - /* - * The restore code fixes up CR3 and CR4 in the following sequence: - * - * [in hibernation asm] - * 1. CR3 <= temporary page tables - * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) - * 3. CR3 <= rdr->cr3 - * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) - * [in restore_processor_state()] - * 5. CR4 <= saved CR4 - * 6. CR3 <= saved CR3 - * - * Our mmu_cr4_features has CR4.PCIDE=0, and toggling - * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so - * rdr->cr3 needs to point to valid page tables but must not - * have any of the PCID bits set. - */ - rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; - - rdr->magic = RESTORE_MAGIC; - - hibernation_e820_save(rdr->e820_digest); - - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - jump_address_phys = rdr->jump_address_phys; - restore_cr3 = rdr->cr3; - - if (rdr->magic != RESTORE_MAGIC) { - pr_crit("Unrecognized hibernate image header format!\n"); - return -EINVAL; - } - - if (hibernation_e820_mismatch(rdr->e820_digest)) { - pr_crit("Hibernate inconsistent memory map detected!\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 6e56815e13a0..6fe383002125 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -12,6 +12,7 @@ #include <asm/page_types.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/frame.h> .text @@ -24,13 +25,30 @@ ENTRY(swsusp_arch_suspend) pushfl popl saved_context_eflags + /* save cr3 */ + movl %cr3, %eax + movl %eax, restore_cr3 + + FRAME_BEGIN call swsusp_save + FRAME_END ret +ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) + /* prepare to jump to the image kernel */ + movl restore_jump_address, %ebx + movl restore_cr3, %ebp + movl mmu_cr4_features, %ecx - movl resume_pg_dir, %eax - subl $__PAGE_OFFSET, %eax + + /* jump to relocated restore code */ + movl relocated_restore_code, %eax + jmpl *%eax + +/* code below has been relocated to a safe page */ +ENTRY(core_restore_code) + movl temp_pgt, %eax movl %eax, %cr3 jecxz 1f # cr4 Pentium and higher, skip if zero @@ -49,7 +67,7 @@ copy_loop: movl pbe_address(%edx), %esi movl pbe_orig_address(%edx), %edi - movl $1024, %ecx + movl $(PAGE_SIZE >> 2), %ecx rep movsl @@ -58,10 +76,13 @@ copy_loop: .p2align 4,,7 done: + jmpl *%ebx + + /* code below belongs to the image kernel */ + .align PAGE_SIZE +ENTRY(restore_registers) /* go back to the original page tables */ - movl $swapper_pg_dir, %eax - subl $__PAGE_OFFSET, %eax - movl %eax, %cr3 + movl %ebp, %cr3 movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on @@ -82,4 +103,8 @@ done: xorl %eax, %eax + /* tell the hibernation core that we've just restored the memory */ + movl %eax, in_suspend + ret +ENDPROC(restore_registers) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index fd369a6e9ff8..3008baa2fa95 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -59,7 +59,7 @@ ENTRY(restore_image) movq restore_cr3(%rip), %r9 /* prepare to switch to temporary page tables */ - movq temp_level4_pgt(%rip), %rax + movq temp_pgt(%rip), %rax movq mmu_cr4_features(%rip), %rbx /* prepare to copy image data to their original locations */ diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 3a6c8ebc8032..0b08067c45f3 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -196,6 +196,7 @@ static const char *rel_type(unsigned type) #if ELF_BITS == 64 REL_TYPE(R_X86_64_NONE), REL_TYPE(R_X86_64_64), + REL_TYPE(R_X86_64_PC64), REL_TYPE(R_X86_64_PC32), REL_TYPE(R_X86_64_GOT32), REL_TYPE(R_X86_64_PLT32), @@ -782,6 +783,15 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, add_reloc(&relocs32neg, offset); break; + case R_X86_64_PC64: + /* + * Only used by jump labels + */ + if (is_percpu_sym(sym, symname)) + die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", + symname); + break; + case R_X86_64_32: case R_X86_64_32S: case R_X86_64_64: diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 548197212a45..413f3519d9a1 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -116,8 +116,7 @@ do { \ #define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ #define R_X86_64_8 14 /* Direct 8 bit sign extended */ #define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ - -#define R_X86_64_NUM 16 +#define R_X86_64_PC64 24 /* Place relative 64-bit signed */ /* * This is used to ensure we don't load something for the wrong architecture. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2eeddd814653..0ca46e03b830 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -5,6 +5,7 @@ #include <linux/kexec.h> #include <linux/slab.h> +#include <xen/xen.h> #include <xen/features.h> #include <xen/page.h> #include <xen/interface/memory.h> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index c85d1a88f476..2a9025343534 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -11,6 +11,7 @@ #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/interface/memory.h> #include <xen/interface/hvm/start_info.h> diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 33a783c77d96..b99585034dd2 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -23,6 +23,7 @@ #include <linux/io.h> #include <linux/export.h> +#include <xen/xen.h> #include <xen/platform_pci.h> #include "xen-ops.h" diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 95997e6c0696..0972184f3f19 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -3,6 +3,7 @@ #include <linux/interrupt.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> |