diff options
Diffstat (limited to 'arch/x86/kernel')
46 files changed, 1413 insertions, 357 deletions
| diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 81bb565f4497..29786c87e864 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o				:= n  KASAN_SANITIZE_paravirt.o				:= n  OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y  OBJECT_FILES_NON_STANDARD_test_nx.o			:= y  OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o	:= y +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y +endif +  # If instrumentation of this dir is enabled, boot hangs during first second.  # Probably could be more selective here, but note that files related to irqs,  # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to @@ -112,6 +115,8 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o  obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o  obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_JAILHOUSE_GUEST)	+= jailhouse.o +  obj-$(CONFIG_EISA)		+= eisa.o  obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f4c463df8b08..ec3a286163c3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -68,8 +68,9 @@ int acpi_ioapic;  int acpi_strict;  int acpi_disable_cmcff; +/* ACPI SCI override configuration */  u8 acpi_sci_flags __initdata; -int acpi_sci_override_gsi __initdata; +u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;  int acpi_skip_timer_override __initdata;  int acpi_use_timer_override __initdata;  int acpi_fix_pin2_polarity __initdata; @@ -112,8 +113,6 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {  	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15  }; -#define	ACPI_INVALID_GSI		INT_MIN -  /*   * This is just a simple wrapper around early_memremap(),   * with sanity checks for phys == 0 and size == 0. 
@@ -372,7 +371,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,  	 * and acpi_isa_irq_to_gsi() may give wrong result.  	 */  	if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) -		isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI; +		isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;  	isa_irq_to_gsi[bus_irq] = gsi;  } @@ -620,24 +619,24 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)  	}  	rc = acpi_get_override_irq(gsi, &trigger, &polarity); -	if (rc == 0) { -		trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; -		polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; -		irq = acpi_register_gsi(NULL, gsi, trigger, polarity); -		if (irq >= 0) { -			*irqp = irq; -			return 0; -		} -	} +	if (rc) +		return rc; -	return -1; +	trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; +	polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; +	irq = acpi_register_gsi(NULL, gsi, trigger, polarity); +	if (irq < 0) +		return irq; + +	*irqp = irq; +	return 0;  }  EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);  int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)  {  	if (isa_irq < nr_legacy_irqs() && -	    isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) { +	    isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {  		*gsi = isa_irq_to_gsi[isa_irq];  		return 0;  	} @@ -676,8 +675,7 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,  	mutex_lock(&acpi_ioapic_lock);  	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);  	/* Don't set up the ACPI SCI because it's already set up */ -	if (irq >= 0 && enable_update_mptable && -	    acpi_gbl_FADT.sci_interrupt != gsi) +	if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)  		mp_config_acpi_gsi(dev, gsi, trigger, polarity);  	mutex_unlock(&acpi_ioapic_lock);  #endif @@ -1211,8 +1209,9 @@ static int __init acpi_parse_madt_ioapic_entries(void)  	/*  	 * If BIOS did not supply an INT_SRC_OVR for the SCI  	 * pretend we got one so we can set the SCI flags. 
+	 * But ignore setting up SCI on hardware reduced platforms.  	 */ -	if (!acpi_sci_override_gsi) +	if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)  		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,  				      acpi_gbl_FADT.sci_interrupt); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7188aea91549..f1915b744052 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str)  			acpi_nvs_nosave_s3();  		if (strncmp(str, "old_ordering", 12) == 0)  			acpi_old_suspend_ordering(); +		if (strncmp(str, "nobl", 4) == 0) +			acpi_sleep_no_blacklist();  		str = strchr(str, ',');  		if (str != NULL)  			str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index dbaf14d69ebd..30571fdaaf6f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -298,7 +298,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)  	tgt_rip  = next_rip + o_dspl;  	n_dspl = tgt_rip - orig_insn; -	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); +	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);  	if (tgt_rip - orig_insn >= 0) {  		if (n_dspl - 2 <= 127) @@ -344,15 +344,18 @@ done:  static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)  {  	unsigned long flags; +	int i; -	if (instr[0] != 0x90) -		return; +	for (i = 0; i < a->padlen; i++) { +		if (instr[i] != 0x90) +			return; +	}  	local_irq_save(flags);  	add_nops(instr + (a->instrlen - a->padlen), a->padlen);  	local_irq_restore(flags); -	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", +	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",  		   instr, a->instrlen - a->padlen, a->padlen);  } @@ -373,7 +376,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,  	u8 *instr, *replacement;  	u8 
insnbuf[MAX_PATCH_LEN]; -	DPRINTK("alt table %p -> %p", start, end); +	DPRINTK("alt table %px, -> %px", start, end);  	/*  	 * The scan order should be from start to end. A later scanned  	 * alternative code can overwrite previously scanned alternative code. @@ -397,14 +400,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,  			continue;  		} -		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", +		DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",  			a->cpuid >> 5,  			a->cpuid & 0x1f,  			instr, a->instrlen,  			replacement, a->replacementlen, a->padlen); -		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); -		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); +		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); +		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);  		memcpy(insnbuf, replacement, a->replacementlen);  		insnbuf_sz = a->replacementlen; @@ -430,7 +433,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,  				 a->instrlen - a->replacementlen);  			insnbuf_sz += a->instrlen - a->replacementlen;  		} -		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); +		DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);  		text_poke_early(instr, insnbuf, insnbuf_sz);  	} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f5d92bc3b884..2c4d5ece7456 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,7 @@  #include <asm/dma.h>  #include <asm/amd_nb.h>  #include <asm/x86_init.h> +#include <linux/crash_dump.h>  /*   * Using 512M as goal, in case kexec will load kernel_big @@ -56,6 +57,33 @@ int fallback_aper_force __initdata;  int fix_aperture __initdata = 1; +#ifdef CONFIG_PROC_VMCORE +/* + * If the first kernel maps the aperture over e820 RAM, the kdump kernel will + * use the same range because it will remain 
configured in the northbridge. + * Trying to dump this area via /proc/vmcore may crash the machine, so exclude + * it from vmcore. + */ +static unsigned long aperture_pfn_start, aperture_page_count; + +static int gart_oldmem_pfn_is_ram(unsigned long pfn) +{ +	return likely((pfn < aperture_pfn_start) || +		      (pfn >= aperture_pfn_start + aperture_page_count)); +} + +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ +	aperture_pfn_start = aper_base >> PAGE_SHIFT; +	aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; +	WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +} +#else +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ +} +#endif +  /* This code runs before the PCI subsystem is initialized, so just     access the northbridge directly. */ @@ -435,8 +463,16 @@ int __init gart_iommu_hole_init(void)  out:  	if (!fix && !fallback_aper_force) { -		if (last_aper_base) +		if (last_aper_base) { +			/* +			 * If this is the kdump kernel, the first kernel +			 * may have allocated the range over its e820 RAM +			 * and fixed up the northbridge +			 */ +			exclude_from_vmcore(last_aper_base, last_aper_order); +  			return 1; +		}  		return 0;  	} @@ -473,6 +509,14 @@ out:  		return 0;  	} +	/* +	 * If this is the kdump kernel _and_ the first kernel did not +	 * configure the aperture in the northbridge, this range may +	 * overlap with the first kernel's memory. We can't access the +	 * range through vmcore even though it should be part of the dump. 
+	 */ +	exclude_from_vmcore(aper_alloc, aper_order); +  	/* Fix up the north bridges */  	for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {  		int bus, dev_base, dev_limit; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 880441f24146..25ddf02598d2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void)  	return APIC_SYMMETRIC_IO;  } +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ +	unsigned int value; + +	/* +	 * Don't do the setup now if we have a SMP BIOS as the +	 * through-I/O-APIC virtual wire mode might be active. +	 */ +	if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) +		return; + +	/* +	 * Do not trust the local APIC being empty at bootup. +	 */ +	clear_local_APIC(); + +	/* +	 * Enable APIC. +	 */ +	value = apic_read(APIC_SPIV); +	value &= ~APIC_VECTOR_MASK; +	value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 +	/* This bit is reserved on P4/Xeon and should be cleared */ +	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && +	    (boot_cpu_data.x86 == 15)) +		value &= ~APIC_SPIV_FOCUS_DISABLED; +	else +#endif +		value |= APIC_SPIV_FOCUS_DISABLED; +	value |= SPURIOUS_APIC_VECTOR; +	apic_write(APIC_SPIV, value); + +	/* +	 * Set up the virtual wire mode. 
+	 */ +	apic_write(APIC_LVT0, APIC_DM_EXTINT); +	value = APIC_DM_NMI; +	if (!lapic_is_integrated())		/* 82489DX */ +		value |= APIC_LVT_LEVEL_TRIGGER; +	if (apic_extnmi == APIC_EXTNMI_NONE) +		value |= APIC_LVT_MASKED; +	apic_write(APIC_LVT1, value); +} +  /* Init the interrupt delivery mode for the BSP */  void __init apic_intr_mode_init(void)  { diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 25a87028cb3f..e84c9eb4e5b4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -19,6 +19,7 @@  #include <asm/smp.h>  #include <asm/apic.h>  #include <asm/ipi.h> +#include <asm/jailhouse_para.h>  #include <linux/acpi.h> @@ -84,12 +85,8 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)  static void flat_send_IPI_allbutself(int vector)  {  	int cpu = smp_processor_id(); -#ifdef	CONFIG_HOTPLUG_CPU -	int hotplug = 1; -#else -	int hotplug = 0; -#endif -	if (hotplug || vector == NMI_VECTOR) { + +	if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {  		if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {  			unsigned long mask = cpumask_bits(cpu_online_mask)[0]; @@ -218,6 +215,15 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  	return 0;  } +static void physflat_init_apic_ldr(void) +{ +	/* +	 * LDR and DFR are not involved in physflat mode, rather: +	 * "In physical destination mode, the destination processor is +	 * specified by its local APIC ID [...]." 
(Intel SDM, 10.6.2.1) +	 */ +} +  static void physflat_send_IPI_allbutself(int vector)  {  	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -230,7 +236,8 @@ static void physflat_send_IPI_all(int vector)  static int physflat_probe(void)  { -	if (apic == &apic_physflat || num_possible_cpus() > 8) +	if (apic == &apic_physflat || num_possible_cpus() > 8 || +	    jailhouse_paravirt())  		return 1;  	return 0; @@ -251,8 +258,7 @@ static struct apic apic_physflat __ro_after_init = {  	.dest_logical			= 0,  	.check_apicid_used		= NULL, -	/* not needed, but shouldn't hurt: */ -	.init_apic_ldr			= flat_init_apic_ldr, +	.init_apic_ldr			= physflat_init_apic_ldr,  	.ioapic_phys_id_map		= NULL,  	.setup_apic_routing		= NULL, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8a7963421460..8ad2e410974f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -800,18 +800,18 @@ static int irq_polarity(int idx)  	/*  	 * Determine IRQ line polarity (high active or low active):  	 */ -	switch (mp_irqs[idx].irqflag & 0x03) { -	case 0: +	switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { +	case MP_IRQPOL_DEFAULT:  		/* conforms to spec, ie. bus-type dependent polarity */  		if (test_bit(bus, mp_bus_not_pci))  			return default_ISA_polarity(idx);  		else  			return default_PCI_polarity(idx); -	case 1: +	case MP_IRQPOL_ACTIVE_HIGH:  		return IOAPIC_POL_HIGH; -	case 2: +	case MP_IRQPOL_RESERVED:  		pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); -	case 3: +	case MP_IRQPOL_ACTIVE_LOW:  	default: /* Pointless default required due to do gcc stupidity */  		return IOAPIC_POL_LOW;  	} @@ -845,8 +845,8 @@ static int irq_trigger(int idx)  	/*  	 * Determine IRQ trigger mode (edge or level sensitive):  	 */ -	switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { -	case 0: +	switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { +	case MP_IRQTRIG_DEFAULT:  		/* conforms to spec, ie. 
bus-type dependent trigger mode */  		if (test_bit(bus, mp_bus_not_pci))  			trigger = default_ISA_trigger(idx); @@ -854,11 +854,11 @@ static int irq_trigger(int idx)  			trigger = default_PCI_trigger(idx);  		/* Take EISA into account */  		return eisa_irq_trigger(idx, bus, trigger); -	case 1: +	case MP_IRQTRIG_EDGE:  		return IOAPIC_EDGE; -	case 2: +	case MP_IRQTRIG_RESERVED:  		pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); -	case 3: +	case MP_IRQTRIG_LEVEL:  	default: /* Pointless default required due to do gcc stupidity */  		return IOAPIC_LEVEL;  	} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f8b03bb8e725..3cc471beb50b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,  		err = assign_irq_vector_policy(irqd, info);  		trace_vector_setup(virq + i, false, err); -		if (err) +		if (err) { +			irqd->chip_data = NULL; +			free_apic_chip_data(apicd);  			goto error; +		}  	}  	return 0;  error: -	x86_vector_free_irqs(domain, virq, i + 1); +	x86_vector_free_irqs(domain, virq, i);  	return err;  } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e1b8e8bf6b3c..46b675aaf20b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -137,6 +137,8 @@ static int __init early_get_pnodeid(void)  	case UV3_HUB_PART_NUMBER_X:  		uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;  		break; + +	/* Update: UV4A has only a modified revision to indicate HUB fixes */  	case UV4_HUB_PART_NUMBER:  		uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;  		uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ @@ -316,6 +318,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  	} else if (!strcmp(oem_table_id, "UVH")) {  		/* Only UV1 systems: */  		uv_system_type = UV_NON_UNIQUE_APIC; +		
x86_platform.legacy.warm_reset = 0;  		__this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);  		uv_set_apicid_hibit();  		uv_apic = 1; @@ -767,6 +770,7 @@ static __init void map_gru_high(int max_pnode)  		return;  	} +	/* Only UV3 has distributed GRU mode */  	if (is_uv3_hub() && gru.s3.mode) {  		map_gru_distributed(gru.v);  		return; @@ -790,63 +794,61 @@ static __init void map_mmr_high(int max_pnode)  		pr_info("UV: MMR disabled\n");  } -/* - * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY - * and REDIRECT MMR regs are exactly the same on UV3. - */ -struct mmioh_config { -	unsigned long overlay; -	unsigned long redirect; -	char *id; -}; - -static __initdata struct mmioh_config mmiohs[] = { -	{ -		UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, -		UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, -		"MMIOH0" -	}, -	{ -		UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, -		UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, -		"MMIOH1" -	}, -}; - -/* UV3 & UV4 have identical MMIOH overlay configs */ -static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ +static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)  { -	union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; +	unsigned long overlay;  	unsigned long mmr;  	unsigned long base; +	unsigned long nasid_mask; +	unsigned long m_overlay;  	int i, n, shift, m_io, max_io;  	int nasid, lnasid, fi, li;  	char *id; -	id = mmiohs[index].id; -	overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - -	pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); -	if (!overlay.s3.enable) { +	if (index == 0) { +		id = "MMIOH0"; +		m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; +		overlay = uv_read_local_mmr(m_overlay); +		base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; +		mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; +		m_io = (overlay 
& UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) +			>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; +		shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; +		n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; +		nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; +	} else { +		id = "MMIOH1"; +		m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; +		overlay = uv_read_local_mmr(m_overlay); +		base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; +		mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; +		m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) +			>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; +		shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; +		n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; +		nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; +	} +	pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); +	if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {  		pr_info("UV: %s disabled\n", id);  		return;  	} -	shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; -	base = (unsigned long)overlay.s3.base; -	m_io = overlay.s3.m_io; -	mmr = mmiohs[index].redirect; -	n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;  	/* Convert to NASID: */  	min_pnode *= 2;  	max_pnode *= 2;  	max_io = lnasid = fi = li = -1;  	for (i = 0; i < n; i++) { -		union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; +		unsigned long m_redirect = mmr + i * 8; +		unsigned long redirect = uv_read_local_mmr(m_redirect); + +		nasid = redirect & nasid_mask; +		if (i == 0) +			pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", +				id, redirect, m_redirect, nasid); -		redirect.v = uv_read_local_mmr(mmr + i * 8); -		nasid = redirect.s3.nasid;  		/* Invalid NASID: */  		if (nasid < min_pnode || max_pnode < nasid)  			nasid = -1; @@ -894,8 +896,8 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)  	if (is_uv3_hub() || is_uv4_hub()) {  		/* Map both MMIOH regions: */ -		
map_mmioh_high_uv3(0, min_pnode, max_pnode); -		map_mmioh_high_uv3(1, min_pnode, max_pnode); +		map_mmioh_high_uv34(0, min_pnode, max_pnode); +		map_mmioh_high_uv34(1, min_pnode, max_pnode);  		return;  	} diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bcb75dc97d44..ea831c858195 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,8 +829,32 @@ static void init_amd(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_K8);  	if (cpu_has(c, X86_FEATURE_XMM2)) { -		/* MFENCE stops RDTSC speculation */ -		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); +		unsigned long long val; +		int ret; + +		/* +		 * A serializing LFENCE has less overhead than MFENCE, so +		 * use it for execution serialization.  On families which +		 * don't have that MSR, LFENCE is already serializing. +		 * msr_set_bit() uses the safe accessors, too, even if the MSR +		 * is not present. +		 */ +		msr_set_bit(MSR_F10H_DECFG, +			    MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + +		/* +		 * Verify that the MSR write was successful (could be running +		 * under a hypervisor) and only then assume that LFENCE is +		 * serializing. 
+		 */ +		ret = rdmsrl_safe(MSR_F10H_DECFG, &val); +		if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { +			/* A serializing LFENCE stops RDTSC speculation */ +			set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +		} else { +			/* MFENCE stops RDTSC speculation */ +			set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); +		}  	}  	/* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..3bfb2b23d79c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,11 @@   */  #include <linux/init.h>  #include <linux/utsname.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/nospec-branch.h> +#include <asm/cmdline.h>  #include <asm/bugs.h>  #include <asm/processor.h>  #include <asm/processor-flags.h> @@ -19,6 +24,9 @@  #include <asm/alternative.h>  #include <asm/pgtable.h>  #include <asm/set_memory.h> +#include <asm/intel-family.h> + +static void __init spectre_v2_select_mitigation(void);  void __init check_bugs(void)  { @@ -29,6 +37,9 @@ void __init check_bugs(void)  		print_cpu_info(&boot_cpu_data);  	} +	/* Select the proper spectre mitigation before patching alternatives */ +	spectre_v2_select_mitigation(); +  #ifdef CONFIG_X86_32  	/*  	 * Check whether we are able to run this kernel safely on SMP. 
@@ -60,3 +71,249 @@ void __init check_bugs(void)  		set_memory_4k((unsigned long)__va(0), 1);  #endif  } + +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { +	SPECTRE_V2_CMD_NONE, +	SPECTRE_V2_CMD_AUTO, +	SPECTRE_V2_CMD_FORCE, +	SPECTRE_V2_CMD_RETPOLINE, +	SPECTRE_V2_CMD_RETPOLINE_GENERIC, +	SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { +	[SPECTRE_V2_NONE]			= "Vulnerable", +	[SPECTRE_V2_RETPOLINE_MINIMAL]		= "Vulnerable: Minimal generic ASM retpoline", +	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline", +	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline", +	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt)     "Spectre V2 : " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +#ifdef RETPOLINE +static bool spectre_v2_bad_module; + +bool retpoline_module_ok(bool has_retpoline) +{ +	if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) +		return true; + +	pr_err("System may be vunerable to spectre v2\n"); +	spectre_v2_bad_module = true; +	return false; +} + +static inline const char *spectre_v2_module_string(void) +{ +	return spectre_v2_bad_module ? 
" - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } +#endif + +static void __init spec2_print_if_insecure(const char *reason) +{ +	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ +	return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ +	int len = strlen(opt); + +	return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ +	char arg[20]; +	int ret; + +	ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, +				  sizeof(arg)); +	if (ret > 0)  { +		if (match_option(arg, ret, "off")) { +			goto disable; +		} else if (match_option(arg, ret, "on")) { +			spec2_print_if_secure("force enabled on command line."); +			return SPECTRE_V2_CMD_FORCE; +		} else if (match_option(arg, ret, "retpoline")) { +			spec2_print_if_insecure("retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE; +		} else if (match_option(arg, ret, "retpoline,amd")) { +			if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { +				pr_err("retpoline,amd selected but CPU is not AMD. 
Switching to AUTO select\n"); +				return SPECTRE_V2_CMD_AUTO; +			} +			spec2_print_if_insecure("AMD retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE_AMD; +		} else if (match_option(arg, ret, "retpoline,generic")) { +			spec2_print_if_insecure("generic retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE_GENERIC; +		} else if (match_option(arg, ret, "auto")) { +			return SPECTRE_V2_CMD_AUTO; +		} +	} + +	if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) +		return SPECTRE_V2_CMD_AUTO; +disable: +	spec2_print_if_insecure("disabled on command line."); +	return SPECTRE_V2_CMD_NONE; +} + +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ +	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && +	    boot_cpu_data.x86 == 6) { +		switch (boot_cpu_data.x86_model) { +		case INTEL_FAM6_SKYLAKE_MOBILE: +		case INTEL_FAM6_SKYLAKE_DESKTOP: +		case INTEL_FAM6_SKYLAKE_X: +		case INTEL_FAM6_KABYLAKE_MOBILE: +		case INTEL_FAM6_KABYLAKE_DESKTOP: +			return true; +		} +	} +	return false; +} + +static void __init spectre_v2_select_mitigation(void) +{ +	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); +	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + +	/* +	 * If the CPU is not affected and the command line mode is NONE or AUTO +	 * then nothing to do. 
+	 */ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && +	    (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) +		return; + +	switch (cmd) { +	case SPECTRE_V2_CMD_NONE: +		return; + +	case SPECTRE_V2_CMD_FORCE: +		/* FALLTRHU */ +	case SPECTRE_V2_CMD_AUTO: +		goto retpoline_auto; + +	case SPECTRE_V2_CMD_RETPOLINE_AMD: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_amd; +		break; +	case SPECTRE_V2_CMD_RETPOLINE_GENERIC: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_generic; +		break; +	case SPECTRE_V2_CMD_RETPOLINE: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_auto; +		break; +	} +	pr_err("kernel not compiled with retpoline; no mitigation available!"); +	return; + +retpoline_auto: +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { +	retpoline_amd: +		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { +			pr_err("LFENCE not serializing. Switching to generic retpoline\n"); +			goto retpoline_generic; +		} +		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : +					 SPECTRE_V2_RETPOLINE_MINIMAL_AMD; +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +	} else { +	retpoline_generic: +		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : +					 SPECTRE_V2_RETPOLINE_MINIMAL; +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +	} + +	spectre_v2_enabled = mode; +	pr_info("%s\n", spectre_v2_strings[mode]); + +	/* +	 * If neither SMEP or KPTI are available, there is a risk of +	 * hitting userspace addresses in the RSB after a context switch +	 * from a shallow call stack to a deeper one. To prevent this fill +	 * the entire RSB, even when using IBRS. +	 * +	 * Skylake era CPUs have a separate issue with *underflow* of the +	 * RSB, when they will predict 'ret' targets from the generic BTB. +	 * The proper mitigation for this is IBRS. If IBRS is not supported +	 * or deactivated in favour of retpolines the RSB fill on context +	 * switch is required. 
+	 */ +	if ((!boot_cpu_has(X86_FEATURE_PTI) && +	     !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { +		setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); +		pr_info("Filling RSB on context switch\n"); +	} + +	/* Initialize Indirect Branch Prediction Barrier if supported */ +	if (boot_cpu_has(X86_FEATURE_IBPB)) { +		setup_force_cpu_cap(X86_FEATURE_USE_IBPB); +		pr_info("Enabling Indirect Branch Prediction Barrier\n"); +	} +} + +#undef pr_fmt + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, +			  struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		return sprintf(buf, "Not affected\n"); +	if (boot_cpu_has(X86_FEATURE_PTI)) +		return sprintf(buf, "Mitigation: PTI\n"); +	return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, +			    struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) +		return sprintf(buf, "Not affected\n"); +	return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, +			    struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		return sprintf(buf, "Not affected\n"); + +	return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], +		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? 
", IBPB" : "", +		       spectre_v2_module_string()); +} +#endif + +void __ibp_barrier(void) +{ +	__wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0); +} +EXPORT_SYMBOL_GPL(__ibp_barrier); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 68bc6d9b3132..c578cd29c2d2 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -106,6 +106,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c)  #ifdef CONFIG_X86_64  	set_cpu_cap(c, X86_FEATURE_SYSENTER32);  #endif +	if (c->x86_power & (1 << 8)) { +		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); +	}  }  static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 39d7ea865207..c7c996a692fd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -47,6 +47,8 @@  #include <asm/pat.h>  #include <asm/microcode.h>  #include <asm/microcode_intel.h> +#include <asm/intel-family.h> +#include <asm/cpu_device_id.h>  #ifdef CONFIG_X86_LOCAL_APIC  #include <asm/uv/uv.h> @@ -769,6 +771,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)  		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);  		c->x86_capability[CPUID_7_0_EBX] = ebx;  		c->x86_capability[CPUID_7_ECX] = ecx; +		c->x86_capability[CPUID_7_EDX] = edx;  	}  	/* Extended state features: level 0x0000000d */ @@ -876,6 +879,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)  #endif  } +static const __initdata struct x86_cpu_id cpu_no_speculation[] = { +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_CEDARVIEW,	X86_FEATURE_ANY }, +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_CLOVERVIEW,	X86_FEATURE_ANY }, +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_LINCROFT,	X86_FEATURE_ANY }, +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_PENWELL,	X86_FEATURE_ANY }, +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_PINEVIEW,	X86_FEATURE_ANY }, +	{ X86_VENDOR_CENTAUR,	5 }, +	{ X86_VENDOR_INTEL,	5 }, +	{ X86_VENDOR_NSC,	5 }, +	{ 
X86_VENDOR_ANY,	4 }, +	{} +}; + +static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { +	{ X86_VENDOR_AMD }, +	{} +}; + +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +{ +	u64 ia32_cap = 0; + +	if (x86_match_cpu(cpu_no_meltdown)) +		return false; + +	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) +		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + +	/* Rogue Data Cache Load? No! */ +	if (ia32_cap & ARCH_CAP_RDCL_NO) +		return false; + +	return true; +} +  /*   * Do minimum CPU detection early.   * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -923,8 +961,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	setup_force_cpu_cap(X86_FEATURE_ALWAYS); -	if (c->x86_vendor != X86_VENDOR_AMD) -		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); +	if (!x86_match_cpu(cpu_no_speculation)) { +		if (cpu_vulnerable_to_meltdown(c)) +			setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); +		setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +		setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +	}  	fpu__init_system(c); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bea8d3e24f50..479ca4728de0 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -31,6 +31,7 @@ extern const struct hypervisor_x86 x86_hyper_ms_hyperv;  extern const struct hypervisor_x86 x86_hyper_xen_pv;  extern const struct hypervisor_x86 x86_hyper_xen_hvm;  extern const struct hypervisor_x86 x86_hyper_kvm; +extern const struct hypervisor_x86 x86_hyper_jailhouse;  static const __initconst struct hypervisor_x86 * const hypervisors[] =  { @@ -45,6 +46,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =  #ifdef CONFIG_KVM_GUEST  	&x86_hyper_kvm,  #endif +#ifdef CONFIG_JAILHOUSE_GUEST +	&x86_hyper_jailhouse, +#endif  };  enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b1af22073e28..6936d14d4c77 100644 --- 
a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)  		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;  } +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { +	u8 model; +	u8 stepping; +	u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { +	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0B,	0x84 }, +	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0A,	0x84 }, +	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x09,	0x84 }, +	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x0A,	0x84 }, +	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x09,	0x84 }, +	{ INTEL_FAM6_SKYLAKE_X,		0x03,	0x0100013e }, +	{ INTEL_FAM6_SKYLAKE_X,		0x04,	0x0200003c }, +	{ INTEL_FAM6_SKYLAKE_MOBILE,	0x03,	0xc2 }, +	{ INTEL_FAM6_SKYLAKE_DESKTOP,	0x03,	0xc2 }, +	{ INTEL_FAM6_BROADWELL_CORE,	0x04,	0x28 }, +	{ INTEL_FAM6_BROADWELL_GT3E,	0x01,	0x1b }, +	{ INTEL_FAM6_BROADWELL_XEON_D,	0x02,	0x14 }, +	{ INTEL_FAM6_BROADWELL_XEON_D,	0x03,	0x07000011 }, +	{ INTEL_FAM6_BROADWELL_X,	0x01,	0x0b000025 }, +	{ INTEL_FAM6_HASWELL_ULT,	0x01,	0x21 }, +	{ INTEL_FAM6_HASWELL_GT3E,	0x01,	0x18 }, +	{ INTEL_FAM6_HASWELL_CORE,	0x03,	0x23 }, +	{ INTEL_FAM6_HASWELL_X,		0x02,	0x3b }, +	{ INTEL_FAM6_HASWELL_X,		0x04,	0x10 }, +	{ INTEL_FAM6_IVYBRIDGE_X,	0x04,	0x42a }, +	/* Updated in the 20180108 release; blacklist until we know otherwise */ +	{ INTEL_FAM6_ATOM_GEMINI_LAKE,	0x01,	0x22 }, +	/* Observed in the wild */ +	{ INTEL_FAM6_SANDYBRIDGE_X,	0x06,	0x61b }, +	{ INTEL_FAM6_SANDYBRIDGE_X,	0x07,	0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { +		if (c->x86_model == 
spectre_bad_microcodes[i].model && +		    c->x86_mask == spectre_bad_microcodes[i].stepping) +			return (c->microcode <= spectre_bad_microcodes[i].microcode); +	} +	return false; +} +  static void early_init_intel(struct cpuinfo_x86 *c)  {  	u64 misc_enable; @@ -123,6 +176,30 @@ static void early_init_intel(struct cpuinfo_x86 *c)  		c->microcode = intel_get_microcode_revision();  	/* +	 * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, +	 * and they also have a different bit for STIBP support. Also, +	 * a hypervisor might have set the individual AMD bits even on +	 * Intel CPUs, for finer-grained selection of what's available. +	 */ +	if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { +		set_cpu_cap(c, X86_FEATURE_IBRS); +		set_cpu_cap(c, X86_FEATURE_IBPB); +	} +	if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) +		set_cpu_cap(c, X86_FEATURE_STIBP); + +	/* Now if any of them are set, check the blacklist and clear the lot */ +	if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || +	     cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { +		pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); +		clear_cpu_cap(c, X86_FEATURE_IBRS); +		clear_cpu_cap(c, X86_FEATURE_IBPB); +		clear_cpu_cap(c, X86_FEATURE_STIBP); +		clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); +		clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); +	} + +	/*  	 * Atom erratum AAE44/AAF40/AAG38/AAH41:  	 *  	 * A race condition between speculative fetches and invalidating diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 88dcf8479013..410629f10ad3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = {  		.format_str		= "%d=%0*x",  		.fflags			= RFTYPE_RES_CACHE,  	}, +	[RDT_RESOURCE_L2DATA] = +	{ +		.rid			= RDT_RESOURCE_L2DATA, +		.name			= "L2DATA", +		.domains		= domain_init(RDT_RESOURCE_L2DATA), +		.msr_base		= IA32_L2_CBM_BASE, +		
.msr_update		= cat_wrmsr, +		.cache_level		= 2, +		.cache = { +			.min_cbm_bits	= 1, +			.cbm_idx_mult	= 2, +			.cbm_idx_offset	= 0, +		}, +		.parse_ctrlval		= parse_cbm, +		.format_str		= "%d=%0*x", +		.fflags			= RFTYPE_RES_CACHE, +	}, +	[RDT_RESOURCE_L2CODE] = +	{ +		.rid			= RDT_RESOURCE_L2CODE, +		.name			= "L2CODE", +		.domains		= domain_init(RDT_RESOURCE_L2CODE), +		.msr_base		= IA32_L2_CBM_BASE, +		.msr_update		= cat_wrmsr, +		.cache_level		= 2, +		.cache = { +			.min_cbm_bits	= 1, +			.cbm_idx_mult	= 2, +			.cbm_idx_offset	= 1, +		}, +		.parse_ctrlval		= parse_cbm, +		.format_str		= "%d=%0*x", +		.fflags			= RFTYPE_RES_CACHE, +	},  	[RDT_RESOURCE_MBA] =  	{  		.rid			= RDT_RESOURCE_MBA, @@ -259,15 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)  	r->alloc_enabled = true;  } -static void rdt_get_cdp_l3_config(int type) +static void rdt_get_cdp_config(int level, int type)  { -	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; +	struct rdt_resource *r_l = &rdt_resources_all[level];  	struct rdt_resource *r = &rdt_resources_all[type]; -	r->num_closid = r_l3->num_closid / 2; -	r->cache.cbm_len = r_l3->cache.cbm_len; -	r->default_ctrl = r_l3->default_ctrl; -	r->cache.shareable_bits = r_l3->cache.shareable_bits; +	r->num_closid = r_l->num_closid / 2; +	r->cache.cbm_len = r_l->cache.cbm_len; +	r->default_ctrl = r_l->default_ctrl; +	r->cache.shareable_bits = r_l->cache.shareable_bits;  	r->data_width = (r->cache.cbm_len + 3) / 4;  	r->alloc_capable = true;  	/* @@ -277,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type)  	r->alloc_enabled = false;  } +static void rdt_get_cdp_l3_config(void) +{ +	rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); +	rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); +} + +static void rdt_get_cdp_l2_config(void) +{ +	rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); +	rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); +} +  static int get_cache_id(int 
cpu, int level)  {  	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -525,10 +571,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)  		 */  		if (static_branch_unlikely(&rdt_mon_enable_key))  			rmdir_mondata_subdir_allrdtgrp(r, d->id); -		kfree(d->ctrl_val); -		kfree(d->rmid_busy_llc); -		kfree(d->mbm_total); -		kfree(d->mbm_local);  		list_del(&d->list);  		if (is_mbm_enabled())  			cancel_delayed_work(&d->mbm_over); @@ -545,6 +587,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)  			cancel_delayed_work(&d->cqm_limbo);  		} +		kfree(d->ctrl_val); +		kfree(d->rmid_busy_llc); +		kfree(d->mbm_total); +		kfree(d->mbm_local);  		kfree(d);  		return;  	} @@ -645,6 +691,7 @@ enum {  	RDT_FLAG_L3_CAT,  	RDT_FLAG_L3_CDP,  	RDT_FLAG_L2_CAT, +	RDT_FLAG_L2_CDP,  	RDT_FLAG_MBA,  }; @@ -667,6 +714,7 @@ static struct rdt_options rdt_options[]  __initdata = {  	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),  	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),  	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2), +	RDT_OPT(RDT_FLAG_L2_CDP,    "l2cdp",	X86_FEATURE_CDP_L2),  	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),  };  #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -729,15 +777,15 @@ static __init bool get_rdt_alloc_resources(void)  	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {  		rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); -		if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { -			rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); -			rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); -		} +		if (rdt_cpu_has(X86_FEATURE_CDP_L3)) +			rdt_get_cdp_l3_config();  		ret = true;  	}  	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {  		/* CPUID 0x10.2 fields are same format at 0x10.1 */  		rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); +		if (rdt_cpu_has(X86_FEATURE_CDP_L2)) +			rdt_get_cdp_l2_config();  		ret = true;  	} diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 
3397244984f5..3fd7a70ee04a 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -7,12 +7,15 @@  #include <linux/jump_label.h>  #define IA32_L3_QOS_CFG		0xc81 +#define IA32_L2_QOS_CFG		0xc82  #define IA32_L3_CBM_BASE	0xc90  #define IA32_L2_CBM_BASE	0xd10  #define IA32_MBA_THRTL_BASE	0xd50  #define L3_QOS_CDP_ENABLE	0x01ULL +#define L2_QOS_CDP_ENABLE	0x01ULL +  /*   * Event IDs are used to program IA32_QM_EVTSEL before reading event   * counter from IA32_QM_CTR @@ -357,6 +360,8 @@ enum {  	RDT_RESOURCE_L3DATA,  	RDT_RESOURCE_L3CODE,  	RDT_RESOURCE_L2, +	RDT_RESOURCE_L2DATA, +	RDT_RESOURCE_L2CODE,  	RDT_RESOURCE_MBA,  	/* Must be the last */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 64c5ff97ee0d..bdab7d2f51af 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -990,6 +990,7 @@ out_destroy:  	kernfs_remove(kn);  	return ret;  } +  static void l3_qos_cfg_update(void *arg)  {  	bool *enable = arg; @@ -997,8 +998,17 @@ static void l3_qos_cfg_update(void *arg)  	wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);  } -static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +static void l2_qos_cfg_update(void *arg)  { +	bool *enable = arg; + +	wrmsrl(IA32_L2_QOS_CFG, *enable ? 
L2_QOS_CDP_ENABLE : 0ULL); +} + +static int set_cache_qos_cfg(int level, bool enable) +{ +	void (*update)(void *arg); +	struct rdt_resource *r_l;  	cpumask_var_t cpu_mask;  	struct rdt_domain *d;  	int cpu; @@ -1006,16 +1016,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)  	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))  		return -ENOMEM; -	list_for_each_entry(d, &r->domains, list) { +	if (level == RDT_RESOURCE_L3) +		update = l3_qos_cfg_update; +	else if (level == RDT_RESOURCE_L2) +		update = l2_qos_cfg_update; +	else +		return -EINVAL; + +	r_l = &rdt_resources_all[level]; +	list_for_each_entry(d, &r_l->domains, list) {  		/* Pick one CPU from each domain instance to update MSR */  		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);  	}  	cpu = get_cpu();  	/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */  	if (cpumask_test_cpu(cpu, cpu_mask)) -		l3_qos_cfg_update(&enable); +		update(&enable);  	/* Update QOS_CFG MSR on all other cpus in cpu_mask. */ -	smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); +	smp_call_function_many(cpu_mask, update, &enable, 1);  	put_cpu();  	free_cpumask_var(cpu_mask); @@ -1023,52 +1041,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)  	return 0;  } -static int cdp_enable(void) +static int cdp_enable(int level, int data_type, int code_type)  { -	struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; -	struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; -	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; +	struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; +	struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; +	struct rdt_resource *r_l = &rdt_resources_all[level];  	int ret; -	if (!r_l3->alloc_capable || !r_l3data->alloc_capable || -	    !r_l3code->alloc_capable) +	if (!r_l->alloc_capable || !r_ldata->alloc_capable || +	    !r_lcode->alloc_capable)  		return -EINVAL; -	ret = 
set_l3_qos_cfg(r_l3, true); +	ret = set_cache_qos_cfg(level, true);  	if (!ret) { -		r_l3->alloc_enabled = false; -		r_l3data->alloc_enabled = true; -		r_l3code->alloc_enabled = true; +		r_l->alloc_enabled = false; +		r_ldata->alloc_enabled = true; +		r_lcode->alloc_enabled = true;  	}  	return ret;  } -static void cdp_disable(void) +static int cdpl3_enable(void)  { -	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; +	return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, +			  RDT_RESOURCE_L3CODE); +} + +static int cdpl2_enable(void) +{ +	return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, +			  RDT_RESOURCE_L2CODE); +} + +static void cdp_disable(int level, int data_type, int code_type) +{ +	struct rdt_resource *r = &rdt_resources_all[level];  	r->alloc_enabled = r->alloc_capable; -	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { -		rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; -		rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; -		set_l3_qos_cfg(r, false); +	if (rdt_resources_all[data_type].alloc_enabled) { +		rdt_resources_all[data_type].alloc_enabled = false; +		rdt_resources_all[code_type].alloc_enabled = false; +		set_cache_qos_cfg(level, false);  	}  } +static void cdpl3_disable(void) +{ +	cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); +} + +static void cdpl2_disable(void) +{ +	cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); +} + +static void cdp_disable_all(void) +{ +	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) +		cdpl3_disable(); +	if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) +		cdpl2_disable(); +} +  static int parse_rdtgroupfs_options(char *data)  {  	char *token, *o = data;  	int ret = 0;  	while ((token = strsep(&o, ",")) != NULL) { -		if (!*token) -			return -EINVAL; +		if (!*token) { +			ret = -EINVAL; +			goto out; +		} -		if (!strcmp(token, "cdp")) -			ret = cdp_enable(); +		if (!strcmp(token, "cdp")) { +			ret = 
cdpl3_enable(); +			if (ret) +				goto out; +		} else if (!strcmp(token, "cdpl2")) { +			ret = cdpl2_enable(); +			if (ret) +				goto out; +		} else { +			ret = -EINVAL; +			goto out; +		}  	} +	return 0; + +out: +	pr_err("Invalid mount option \"%s\"\n", token); +  	return ret;  } @@ -1223,7 +1288,7 @@ out_mongrp:  out_info:  	kernfs_remove(kn_info);  out_cdp: -	cdp_disable(); +	cdp_disable_all();  out:  	rdt_last_cmd_clear();  	mutex_unlock(&rdtgroup_mutex); @@ -1383,7 +1448,7 @@ static void rdt_kill_sb(struct super_block *sb)  	/*Put everything back to default values. */  	for_each_alloc_enabled_rdt_resource(r)  		reset_all_ctrls(r); -	cdp_disable(); +	cdp_disable_all();  	rmdir_all_sub();  	static_branch_disable_cpuslocked(&rdt_alloc_enable_key);  	static_branch_disable_cpuslocked(&rdt_mon_enable_key); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a06e0b..5bbd06f38ff6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ static struct severity {  #define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y  #define  MASK(x, y)	.mask = x, .result = y  #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)  #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)  #define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ static struct severity {  		NOSER, BITCLR(MCI_STATUS_UC)  		), +	/* +	 * known AO MCACODs reported via MCE or CMC: +	 * +	 * SRAO could be signaled either via a machine check exception or +	 * CMCI with the corresponding bit S 1 or 0. So we don't need to +	 * check bit S for SRAO. 
+	 */ +	MCESEV( +		AO, "Action optional: memory scrubbing error", +		SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) +		), +	MCESEV( +		AO, "Action optional: last level cache writeback error", +		SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) +		), +  	/* ignore OVER for UCNA */  	MCESEV(  		UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ static struct severity {  		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)  		), -	/* known AO MCACODs: */ -	MCESEV( -		AO, "Action optional: memory scrubbing error", -		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) -		), -	MCESEV( -		AO, "Action optional: last level cache writeback error", -		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) -		),  	MCESEV(  		SOME, "Action optional: unknown MCACOD",  		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1d616d08eee..ba1f9555fbc5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)  bool mce_is_memory_error(struct mce *m)  {  	if (m->cpuvendor == X86_VENDOR_AMD) { -		/* ErrCodeExt[20:16] */ -		u8 xec = (m->status >> 16) & 0x1f; +		return amd_mce_is_memory_error(m); -		return (xec == 0x0 || xec == 0x8);  	} else if (m->cpuvendor == X86_VENDOR_INTEL) {  		/*  		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes @@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)  }  EXPORT_SYMBOL_GPL(mce_is_memory_error); +static bool mce_is_correctable(struct mce *m) +{ +	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) +		return false; + +	if (m->status & MCI_STATUS_UC) +		return false; + +	return true; +} +  static bool cec_add_mce(struct mce *m)  {  	if (!m) @@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)  	/* We eat only correctable DRAM errors with 
usable addresses. */  	if (mce_is_memory_error(m) && -	    !(m->status & MCI_STATUS_UC) && +	    mce_is_correctable(m)  &&  	    mce_usable_address(m))  		if (!cec_add_elem(m->addr >> PAGE_SHIFT))  			return true; @@ -582,7 +591,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,  	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {  		pfn = mce->addr >> PAGE_SHIFT; -		memory_failure(pfn, MCE_VECTOR, 0); +		memory_failure(pfn, 0);  	}  	return NOTIFY_OK; @@ -1046,7 +1055,7 @@ static int do_memory_failure(struct mce *m)  	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);  	if (!(m->mcgstatus & MCG_STATUS_RIPV))  		flags |= MF_MUST_KILL; -	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); +	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);  	if (ret)  		pr_err("Memory error not recovered");  	return ret; @@ -1325,7 +1334,7 @@ out_ist:  EXPORT_SYMBOL_GPL(do_machine_check);  #ifndef CONFIG_MEMORY_FAILURE -int memory_failure(unsigned long pfn, int vector, int flags) +int memory_failure(unsigned long pfn, int flags)  {  	/* mce_severity() should not hand us an ACTION_REQUIRED error */  	BUG_ON(flags & MF_ACTION_REQUIRED); @@ -1785,6 +1794,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)  void (*machine_check_vector)(struct pt_regs *, long error_code) =  						unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ +	machine_check_vector(regs, error_code); +} +  /*   * Called for each booted CPU to set up machine checks.   
* Must be called with preempt off: diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 486f640b02ef..0f32ad242324 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)  }  EXPORT_SYMBOL_GPL(smca_get_long_name); +static enum smca_bank_types smca_get_bank_type(struct mce *m) +{ +	struct smca_bank *b; + +	if (m->bank >= N_SMCA_BANK_TYPES) +		return N_SMCA_BANK_TYPES; + +	b = &smca_banks[m->bank]; +	if (!b->hwid) +		return N_SMCA_BANK_TYPES; + +	return b->hwid->bank_type; +} +  static struct smca_hwid smca_hwid_mcatypes[] = {  	/* { bank_type, hwid_mcatype, xec_bitmap } */ @@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)  	    (deferred_error_int_vector != amd_deferred_error_interrupt))  		deferred_error_int_vector = amd_deferred_error_interrupt; -	low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; +	if (!mce_flags.smca) +		low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; +  	wrmsr(MSR_CU_DEF_ERR, low, high);  } @@ -738,6 +754,17 @@ out_err:  }  EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); +bool amd_mce_is_memory_error(struct mce *m) +{ +	/* ErrCodeExt[20:16] */ +	u8 xec = (m->status >> 16) & 0x1f; + +	if (mce_flags.smca) +		return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; + +	return m->bank == 4 && xec == 0x8; +} +  static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)  {  	struct mce m; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c4fa4a85d4cb..e4fc595cd6ea 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void)  		break;  	case X86_VENDOR_AMD:  		if (c->x86 >= 0x10) -			return save_microcode_in_initrd_amd(cpuid_eax(1)); +			ret = save_microcode_in_initrd_amd(cpuid_eax(1));  		break;  	
default:  		break; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8ccdca6d3f9e..f7c55b0e753a 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";  /* Current microcode patch used in early patching on the APs. */  static struct microcode_intel *intel_ucode_patch; +/* last level cache size per core */ +static int llc_size_per_core; +  static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,  					unsigned int s2, unsigned int p2)  { @@ -910,8 +913,19 @@ static bool is_blacklisted(unsigned int cpu)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { -		pr_err_once("late loading on model 79 is disabled.\n"); +	/* +	 * Late loading on model 79 with microcode revision less than 0x0b000021 +	 * and LLC size per core bigger than 2.5MB may result in a system hang. +	 * This behavior is documented in item BDF90, #334165 (Intel Xeon +	 * Processor E7-8800/4800 v4 Product Family). 
+	 */ +	if (c->x86 == 6 && +	    c->x86_model == INTEL_FAM6_BROADWELL_X && +	    c->x86_mask == 0x01 && +	    llc_size_per_core > 2621440 && +	    c->microcode < 0x0b000021) { +		pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); +		pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); +		return true;  	} @@ -966,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = {  	.apply_microcode                  = apply_microcode_intel,  }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ +	u64 llc_size = c->x86_cache_size * 1024; + +	do_div(llc_size, c->x86_max_cores); + +	return (int)llc_size; +} +  struct microcode_ops * __init init_intel_microcode(void)  {  	struct cpuinfo_x86 *c = &boot_cpu_data; @@ -976,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void)  		return NULL;  	} +	llc_size_per_core = calc_llc_size_per_core(c); +  	return &microcode_intel_ops;  } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46..4075d2be5357 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,12 +21,10 @@ struct cpuid_bit {  static const struct cpuid_bit cpuid_bits[] = {  	{ X86_FEATURE_APERFMPERF,       CPUID_ECX,  0, 0x00000006, 0 },  	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 }, -	{ X86_FEATURE_INTEL_PT,		CPUID_EBX, 25, 0x00000007, 0 }, -	{ X86_FEATURE_AVX512_4VNNIW,    CPUID_EDX,  2, 0x00000007, 0 }, -	{ X86_FEATURE_AVX512_4FMAPS,    CPUID_EDX,  3, 0x00000007, 0 },  	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },  	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },  	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 }, +	{ X86_FEATURE_CDP_L2,		CPUID_ECX,  2, 0x00000010, 2 },  	{ X86_FEATURE_MBA,		CPUID_EBX,  3, 0x00000010, 0 },  	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },  	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 
}, diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@  #include <asm/segment.h>  #include <asm/export.h>  #include <asm/ftrace.h> +#include <asm/nospec-branch.h>  #ifdef CC_USING_FENTRY  # define function_hook	__fentry__ @@ -197,7 +198,8 @@ ftrace_stub:  	movl	0x4(%ebp), %edx  	subl	$MCOUNT_INSN_SIZE, %eax -	call	*ftrace_trace_function +	movl	ftrace_trace_function, %ecx +	CALL_NOSPEC %ecx  	popl	%edx  	popl	%ecx @@ -241,5 +243,5 @@ return_to_handler:  	movl	%eax, %ecx  	popl	%edx  	popl	%eax -	jmp	*%ecx +	JMP_NOSPEC %ecx  #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..91b2cff4b79a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,8 @@  #include <asm/ptrace.h>  #include <asm/ftrace.h>  #include <asm/export.h> - +#include <asm/nospec-branch.h> +#include <asm/unwind_hints.h>  	.code64  	.section .entry.text, "ax" @@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__)  EXPORT_SYMBOL(mcount)  #endif -/* All cases save the original rbp (8 bytes) */  #ifdef CONFIG_FRAME_POINTER  # ifdef CC_USING_FENTRY  /* Save parent and function stack frames (rip and rbp) */ @@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount)  # endif  #else  /* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE	8 +# define MCOUNT_FRAME_SIZE	0  #endif /* CONFIG_FRAME_POINTER */  /* Size of stack used to save mcount regs in save_mcount_regs */ @@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount)   */  .macro save_mcount_regs added=0 -	/* Always save the original rbp */ +#ifdef CONFIG_FRAME_POINTER +	/* Save the original rbp */  	pushq %rbp -#ifdef CONFIG_FRAME_POINTER  	/*  	 * Stack traces will stop at the ftrace trampoline if the frame pointer  	 * is not set up properly. If fentry is used, we need to save a frame @@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount)  	 * Save the original RBP. 
Even though the mcount ABI does not  	 * require this, it helps out callers.  	 */ +#ifdef CONFIG_FRAME_POINTER  	movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else +	movq %rbp, %rdx +#endif  	movq %rdx, RBP(%rsp)  	/* Copy the parent address into %rsi (second parameter) */ @@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount)  ENTRY(function_hook)  	retq -END(function_hook) +ENDPROC(function_hook)  ENTRY(ftrace_caller)  	/* save_mcount_regs fills in first two parameters */ @@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call)  /* This is weak to keep gas from relaxing the jumps */  WEAK(ftrace_stub)  	retq -END(ftrace_caller) +ENDPROC(ftrace_caller)  ENTRY(ftrace_regs_caller)  	/* Save the current flags before any operations that can change them */ @@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end)  	jmp ftrace_epilogue -END(ftrace_regs_caller) +ENDPROC(ftrace_regs_caller)  #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -286,12 +290,12 @@ trace:  	 * ip and parent ip are used and the list function is called when  	 * function tracing is enabled.  	 
*/ -	call   *ftrace_trace_function - +	movq ftrace_trace_function, %r8 +	CALL_NOSPEC %r8  	restore_mcount_regs  	jmp fgraph_trace -END(function_hook) +ENDPROC(function_hook)  #endif /* CONFIG_DYNAMIC_FTRACE */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller)  	restore_mcount_regs  	retq -END(ftrace_graph_caller) +ENDPROC(ftrace_graph_caller) -GLOBAL(return_to_handler) +ENTRY(return_to_handler) +	UNWIND_HINT_EMPTY  	subq  $24, %rsp  	/* Save the return values */ @@ -329,5 +334,6 @@ GLOBAL(return_to_handler)  	movq 8(%rsp), %rdx  	movq (%rsp), %rax  	addq $24, %rsp -	jmp *%rdi +	JMP_NOSPEC %rdi +END(return_to_handler)  #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a5d757b9cfd..7ba5d819ebe3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr,  	p = fixup_pointer(&phys_base, physaddr);  	*p += load_delta - sme_get_me_mask(); -	/* Encrypt the kernel (if SME is active) */ -	sme_encrypt_kernel(); +	/* Encrypt the kernel and related (if SME is active) */ +	sme_encrypt_kernel(bp);  	/*  	 * Return the SME encryption mask (if SME is active) to be used as a diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d985cef3984f..56d99be3706a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -56,7 +56,7 @@ struct idt_data {   * Early traps running on the DEFAULT_STACK because the other interrupt   * stacks work only after cpu_init().   */ -static const __initdata struct idt_data early_idts[] = { +static const __initconst struct idt_data early_idts[] = {  	INTG(X86_TRAP_DB,		debug),  	SYSG(X86_TRAP_BP,		int3),  #ifdef CONFIG_X86_32 @@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = {   * the traps which use them are reinitialized with IST after cpu_init() has   * set up TSS.   
*/ -static const __initdata struct idt_data def_idts[] = { +static const __initconst struct idt_data def_idts[] = {  	INTG(X86_TRAP_DE,		divide_error),  	INTG(X86_TRAP_NMI,		nmi),  	INTG(X86_TRAP_BR,		bounds), @@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = {  /*   * The APIC and SMP idt entries   */ -static const __initdata struct idt_data apic_idts[] = { +static const __initconst struct idt_data apic_idts[] = {  #ifdef CONFIG_SMP  	INTG(RESCHEDULE_VECTOR,		reschedule_interrupt),  	INTG(CALL_FUNCTION_VECTOR,	call_function_interrupt), @@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = {   * Early traps running on the DEFAULT_STACK because the other interrupt   * stacks work only after cpu_init().   */ -static const __initdata struct idt_data early_pf_idts[] = { +static const __initconst struct idt_data early_pf_idts[] = {  	INTG(X86_TRAP_PF,		page_fault),  }; @@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = {   * Override for the debug_idt. Same as the default, but with interrupt   * stack set to DEFAULT_STACK (0). Required for NMI trap handling.   */ -static const __initdata struct idt_data dbg_idts[] = { +static const __initconst struct idt_data dbg_idts[] = {  	INTG(X86_TRAP_DB,	debug),  	INTG(X86_TRAP_BP,	int3),  }; @@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;   * The exceptions which use Interrupt stacks. They are setup after   * cpu_init() when the TSS has been initialized.   
*/ -static const __initdata struct idt_data ist_idts[] = { +static const __initconst struct idt_data ist_idts[] = {  	ISTG(X86_TRAP_DB,	debug,		DEBUG_STACK),  	ISTG(X86_TRAP_NMI,	nmi,		NMI_STACK),  	SISTG(X86_TRAP_BP,	int3,		DEBUG_STACK), diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@  #include <linux/mm.h>  #include <asm/apic.h> +#include <asm/nospec-branch.h>  #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);  static void call_on_stack(void *func, void *stack)  {  	asm volatile("xchgl	%%ebx,%%esp	\n" -		     "call	*%%edi		\n" +		     CALL_NOSPEC  		     "movl	%%ebx,%%esp	\n"  		     : "=b" (stack)  		     : "0" (stack), -		       "D"(func) +		       [thunk_target] "D"(func)  		     : "memory", "cc", "edx", "ecx", "eax");  } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)  		call_on_stack(print_stack_overflow, isp);  	asm volatile("xchgl	%%ebx,%%esp	\n" -		     "call	*%%edi		\n" +		     CALL_NOSPEC  		     "movl	%%ebx,%%esp	\n"  		     : "=a" (arg1), "=b" (isp)  		     :  "0" (desc),   "1" (isp), -			"D" (desc->handle_irq) +			[thunk_target] "D" (desc->handle_irq)  		     : "memory", "cc", "ecx");  	return 1;  } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 8da3e909e967..a539410c4ea9 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,6 +61,9 @@ void __init init_ISA_irqs(void)  	struct irq_chip *chip = legacy_pic->chip;  	int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) +	init_bsp_APIC(); +#endif  	legacy_pic->init(0);  	for (i = 0; i < nr_legacy_irqs(); i++) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index f73f475d0573..d177940aa090 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -24,7 +24,6 @@  #include 
<linux/cpumask.h>  #include <linux/cpuset.h>  #include <linux/mutex.h> -#include <linux/sched.h>  #include <linux/sysctl.h>  #include <linux/nodemask.h> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c new file mode 100644 index 000000000000..b68fd895235a --- /dev/null +++ b/arch/x86/kernel/jailhouse.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL2.0 +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + *  Jan Kiszka <jan.kiszka@siemens.com> + */ + +#include <linux/acpi_pmtmr.h> +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <asm/apic.h> +#include <asm/cpu.h> +#include <asm/hypervisor.h> +#include <asm/i8259.h> +#include <asm/irqdomain.h> +#include <asm/pci_x86.h> +#include <asm/reboot.h> +#include <asm/setup.h> + +static __initdata struct jailhouse_setup_data setup_data; +static unsigned int precalibrated_tsc_khz; + +static uint32_t jailhouse_cpuid_base(void) +{ +	if (boot_cpu_data.cpuid_level < 0 || +	    !boot_cpu_has(X86_FEATURE_HYPERVISOR)) +		return 0; + +	return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); +} + +static uint32_t __init jailhouse_detect(void) +{ +	return jailhouse_cpuid_base(); +} + +static void jailhouse_get_wallclock(struct timespec *now) +{ +	memset(now, 0, sizeof(*now)); +} + +static void __init jailhouse_timer_init(void) +{ +	lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); +} + +static unsigned long jailhouse_get_tsc(void) +{ +	return precalibrated_tsc_khz; +} + +static void __init jailhouse_x2apic_init(void) +{ +#ifdef CONFIG_X86_X2APIC +	if (!x2apic_enabled()) +		return; +	/* +	 * We do not have access to IR inside Jailhouse non-root cells.  So +	 * we have to run in physical mode. +	 */ +	x2apic_phys = 1; +	/* +	 * This will trigger the switch to apic_x2apic_phys.  Empty OEM IDs +	 * ensure that only this APIC driver picks up the call. 
+	 */ +	default_acpi_madt_oem_check("", ""); +#endif +} + +static void __init jailhouse_get_smp_config(unsigned int early) +{ +	struct ioapic_domain_cfg ioapic_cfg = { +		.type = IOAPIC_DOMAIN_STRICT, +		.ops = &mp_ioapic_irqdomain_ops, +	}; +	struct mpc_intsrc mp_irq = { +		.type = MP_INTSRC, +		.irqtype = mp_INT, +		.irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, +	}; +	unsigned int cpu; + +	jailhouse_x2apic_init(); + +	register_lapic_address(0xfee00000); + +	for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { +		generic_processor_info(setup_data.cpu_ids[cpu], +				       boot_cpu_apic_version); +	} + +	smp_found_config = 1; + +	if (setup_data.standard_ioapic) { +		mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); + +		/* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ +		mp_irq.srcbusirq = mp_irq.dstirq = 3; +		mp_save_irq(&mp_irq); + +		mp_irq.srcbusirq = mp_irq.dstirq = 4; +		mp_save_irq(&mp_irq); +	} +} + +static void jailhouse_no_restart(void) +{ +	pr_notice("Jailhouse: Restart not supported, halting\n"); +	machine_halt(); +} + +static int __init jailhouse_pci_arch_init(void) +{ +	pci_direct_init(1); + +	/* +	 * There are no bridges on the virtual PCI root bus under Jailhouse, +	 * thus no other way to discover all devices than a full scan. +	 * Respect any overrides via the command line, though. 
+	 */ +	if (pcibios_last_bus < 0) +		pcibios_last_bus = 0xff; + +	return 0; +} + +static void __init jailhouse_init_platform(void) +{ +	u64 pa_data = boot_params.hdr.setup_data; +	struct setup_data header; +	void *mapping; + +	x86_init.irqs.pre_vector_init	= x86_init_noop; +	x86_init.timers.timer_init	= jailhouse_timer_init; +	x86_init.mpparse.get_smp_config	= jailhouse_get_smp_config; +	x86_init.pci.arch_init		= jailhouse_pci_arch_init; + +	x86_platform.calibrate_cpu	= jailhouse_get_tsc; +	x86_platform.calibrate_tsc	= jailhouse_get_tsc; +	x86_platform.get_wallclock	= jailhouse_get_wallclock; +	x86_platform.legacy.rtc		= 0; +	x86_platform.legacy.warm_reset	= 0; +	x86_platform.legacy.i8042	= X86_LEGACY_I8042_PLATFORM_ABSENT; + +	legacy_pic			= &null_legacy_pic; + +	machine_ops.emergency_restart	= jailhouse_no_restart; + +	while (pa_data) { +		mapping = early_memremap(pa_data, sizeof(header)); +		memcpy(&header, mapping, sizeof(header)); +		early_memunmap(mapping, sizeof(header)); + +		if (header.type == SETUP_JAILHOUSE && +		    header.len >= sizeof(setup_data)) { +			pa_data += offsetof(struct setup_data, data); + +			mapping = early_memremap(pa_data, sizeof(setup_data)); +			memcpy(&setup_data, mapping, sizeof(setup_data)); +			early_memunmap(mapping, sizeof(setup_data)); + +			break; +		} + +		pa_data = header.next; +	} + +	if (!pa_data) +		panic("Jailhouse: No valid setup data found"); + +	if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) +		panic("Jailhouse: Unsupported setup data structure"); + +	pmtmr_ioport = setup_data.pm_timer_address; +	pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); + +	precalibrated_tsc_khz = setup_data.tsc_khz; +	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + +	pci_probe = 0; + +	/* +	 * Avoid that the kernel complains about missing ACPI tables - there +	 * are none in a non-root cell. 
+	 */ +	disable_acpi(); +} + +bool jailhouse_paravirt(void) +{ +	return jailhouse_cpuid_base() != 0; +} + +static bool jailhouse_x2apic_available(void) +{ +	/* +	 * The x2APIC is only available if the root cell enabled it. Jailhouse +	 * does not support switching between xAPIC and x2APIC. +	 */ +	return x2apic_enabled(); +} + +const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { +	.name			= "Jailhouse", +	.detect			= jailhouse_detect, +	.init.init_platform	= jailhouse_init_platform, +	.init.x2apic_available	= jailhouse_x2apic_available, +}; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index e941136e24d8..203d398802a3 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -40,6 +40,7 @@  #include <asm/debugreg.h>  #include <asm/set_memory.h>  #include <asm/sections.h> +#include <asm/nospec-branch.h>  #include "common.h" @@ -203,7 +204,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)  }  /* Check whether insn is indirect jump */ -static int insn_is_indirect_jump(struct insn *insn) +static int __insn_is_indirect_jump(struct insn *insn)  {  	return ((insn->opcode.bytes[0] == 0xff &&  		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -237,6 +238,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)  	return (start <= target && target <= start + len);  } +static int insn_is_indirect_jump(struct insn *insn) +{ +	int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE +	/* +	 * Jump to x86_indirect_thunk_* is treated as an indirect jump. +	 * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with +	 * older gcc may use indirect jump. So we add this check instead of +	 * replace indirect-jump check. 
+	 */ +	if (!ret) +		ret = insn_jump_into_range(insn, +				(unsigned long)__indirect_thunk_start, +				(unsigned long)__indirect_thunk_end - +				(unsigned long)__indirect_thunk_start); +#endif +	return ret; +} +  /* Decode whole function to ensure any instructions don't jump into target */  static int can_optimize(unsigned long paddr)  { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3a4b12809ab5..27d0a1712663 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -281,7 +281,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)  	int ELCR_fallback = 0;  	intsrc.type = MP_INTSRC; -	intsrc.irqflag = 0;	/* conforming */ +	intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;  	intsrc.srcbus = 0;  	intsrc.dstapic = mpc_ioapic_id(0); @@ -324,10 +324,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)  			 *  copy that information over to the MP table in the  			 *  irqflag field (level sensitive, active high polarity).  			 
*/ -			if (ELCR_trigger(i)) -				intsrc.irqflag = 13; -			else -				intsrc.irqflag = 0; +			if (ELCR_trigger(i)) { +				intsrc.irqflag = MP_IRQTRIG_LEVEL | +						 MP_IRQPOL_ACTIVE_HIGH; +			} else { +				intsrc.irqflag = MP_IRQTRIG_DEFAULT | +						 MP_IRQPOL_DEFAULT; +			}  		}  		intsrc.srcbusirq = i; @@ -419,7 +422,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)  	construct_ioapic_table(mpc_default_type);  	lintsrc.type = MP_LINTSRC; -	lintsrc.irqflag = 0;		/* conforming */ +	lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;  	lintsrc.srcbusid = 0;  	lintsrc.srcbusirq = 0;  	lintsrc.destapic = MP_APIC_ALL; @@ -664,7 +667,7 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)  	if (m->irqtype != mp_INT)  		return 0; -	if (m->irqflag != 0x0f) +	if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))  		return 0;  	/* not legacy */ @@ -673,7 +676,8 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)  		if (mp_irqs[i].irqtype != mp_INT)  			continue; -		if (mp_irqs[i].irqflag != 0x0f) +		if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | +					   MP_IRQPOL_ACTIVE_LOW))  			continue;  		if (mp_irqs[i].srcbus != m->srcbus) @@ -784,7 +788,8 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,  		if (mp_irqs[i].irqtype != mp_INT)  			continue; -		if (mp_irqs[i].irqflag != 0x0f) +		if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | +					   MP_IRQPOL_ACTIVE_LOW))  			continue;  		if (nr_m_spare > 0) { diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 39a59299bfa0..235fe6008ac8 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -9,6 +9,7 @@ void __init x86_early_init_platform_quirks(void)  {  	x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;  	x86_platform.legacy.rtc = 1; +	x86_platform.legacy.warm_reset = 1;  	x86_platform.legacy.reserve_bios_regions = 0;  	x86_platform.legacy.devices.pnpbios = 1; diff --git 
a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 832a6acd730f..03408b942adb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,7 +21,6 @@  #include <linux/dmi.h>  #include <linux/utsname.h>  #include <linux/stackprotector.h> -#include <linux/tick.h>  #include <linux/cpuidle.h>  #include <trace/events/power.h>  #include <linux/hw_breakpoint.h> @@ -380,19 +379,24 @@ void stop_this_cpu(void *dummy)  	disable_local_APIC();  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); +	/* +	 * Use wbinvd on processors that support SME. This provides support +	 * for performing a successful kexec when going from SME inactive +	 * to SME active (or vice-versa). The cache must be cleared so that +	 * if there are entries with the same physical address, both with and +	 * without the encryption bit, they don't race each other when flushed +	 * and potentially end up with the wrong entry being committed to +	 * memory. +	 */ +	if (boot_cpu_has(X86_FEATURE_SME)) +		native_wbinvd();  	for (;;) {  		/* -		 * Use wbinvd followed by hlt to stop the processor. This -		 * provides support for kexec on a processor that supports -		 * SME. With kexec, going from SME inactive to SME active -		 * requires clearing cache entries so that addresses without -		 * the encryption bit set don't corrupt the same physical -		 * address that has the encryption bit set when caches are -		 * flushed. To achieve this a wbinvd is performed followed by -		 * a hlt. Even if the processor is not in the kexec/SME -		 * scenario this only adds a wbinvd to a halting processor. +		 * Use native_halt() so that memory contents don't change +		 * (stack usage and variables) after possibly issuing the +		 * native_wbinvd() above.  		 
*/ -		asm volatile("wbinvd; hlt" : : : "memory"); +		native_halt();  	}  } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 145810b0edf6..1ae67e982af7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@  #include <asm/alternative.h>  #include <asm/prom.h>  #include <asm/microcode.h> -#include <asm/mmu_context.h>  #include <asm/kaslr.h>  #include <asm/unwind.h> @@ -364,16 +363,6 @@ static void __init reserve_initrd(void)  	    !ramdisk_image || !ramdisk_size)  		return;		/* No initrd provided by bootloader */ -	/* -	 * If SME is active, this memory will be marked encrypted by the -	 * kernel when it is accessed (including relocation). However, the -	 * ramdisk image was loaded decrypted by the bootloader, so make -	 * sure that it is encrypted before accessing it. For SEV the -	 * ramdisk will already be encrypted, so only do this for SME. -	 */ -	if (sme_active()) -		sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); -  	initrd_start = 0;  	mapped_size = memblock_mem_size(max_pfn_mapped); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 8c6da1a643da..ac057f9b0763 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -25,8 +25,8 @@ static inline void signal_compat_build_tests(void)  	 * limits also have to look at this code.  Make sure any  	 * new fields are handled in copy_siginfo_to_user32()!  	 
*/ -	BUILD_BUG_ON(NSIGILL  != 8); -	BUILD_BUG_ON(NSIGFPE  != 8); +	BUILD_BUG_ON(NSIGILL  != 11); +	BUILD_BUG_ON(NSIGFPE  != 13);  	BUILD_BUG_ON(NSIGSEGV != 4);  	BUILD_BUG_ON(NSIGBUS  != 5);  	BUILD_BUG_ON(NSIGTRAP != 4); @@ -64,7 +64,7 @@ static inline void signal_compat_build_tests(void)  	CHECK_SI_SIZE   (_kill, 2*sizeof(int));  	CHECK_CSI_OFFSET(_timer); -	CHECK_CSI_SIZE  (_timer, 5*sizeof(int)); +	CHECK_CSI_SIZE  (_timer, 3*sizeof(int));  	CHECK_SI_SIZE   (_timer, 6*sizeof(int));  	CHECK_CSI_OFFSET(_rt); @@ -75,9 +75,11 @@ static inline void signal_compat_build_tests(void)  	CHECK_CSI_SIZE  (_sigchld, 5*sizeof(int));  	CHECK_SI_SIZE   (_sigchld, 8*sizeof(int)); +#ifdef CONFIG_X86_X32_ABI  	CHECK_CSI_OFFSET(_sigchld_x32);  	CHECK_CSI_SIZE  (_sigchld_x32, 7*sizeof(int));  	/* no _sigchld_x32 in the generic siginfo_t */ +#endif  	CHECK_CSI_OFFSET(_sigfault);  	CHECK_CSI_SIZE  (_sigfault, 4*sizeof(int)); @@ -96,6 +98,8 @@ static inline void signal_compat_build_tests(void)  void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)  { +	signal_compat_build_tests(); +  	/* Don't leak in-kernel non-uapi flags to user-space */  	if (oact)  		oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); @@ -111,116 +115,3 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)  	if (in_x32_syscall())  		act->sa.sa_flags |= SA_X32_ABI;  } - -int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, -		bool x32_ABI) -{ -	int err = 0; - -	signal_compat_build_tests(); - -	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) -		return -EFAULT; - -	put_user_try { -		/* If you change siginfo_t structure, please make sure that -		   this code is fixed accordingly. -		   It should never copy any pad contained in the structure -		   to avoid security leaks, but must copy the generic -		   3 ints plus the relevant union member.  
*/ -		put_user_ex(from->si_signo, &to->si_signo); -		put_user_ex(from->si_errno, &to->si_errno); -		put_user_ex(from->si_code, &to->si_code); - -		if (from->si_code < 0) { -			put_user_ex(from->si_pid, &to->si_pid); -			put_user_ex(from->si_uid, &to->si_uid); -			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); -		} else { -			/* -			 * First 32bits of unions are always present: -			 * si_pid === si_band === si_tid === si_addr(LS half) -			 */ -			put_user_ex(from->_sifields._pad[0], -					  &to->_sifields._pad[0]); -			switch (siginfo_layout(from->si_signo, from->si_code)) { -			case SIL_FAULT: -				if (from->si_signo == SIGBUS && -				    (from->si_code == BUS_MCEERR_AR || -				     from->si_code == BUS_MCEERR_AO)) -					put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); - -				if (from->si_signo == SIGSEGV) { -					if (from->si_code == SEGV_BNDERR) { -						compat_uptr_t lower = (unsigned long)from->si_lower; -						compat_uptr_t upper = (unsigned long)from->si_upper; -						put_user_ex(lower, &to->si_lower); -						put_user_ex(upper, &to->si_upper); -					} -					if (from->si_code == SEGV_PKUERR) -						put_user_ex(from->si_pkey, &to->si_pkey); -				} -				break; -			case SIL_SYS: -				put_user_ex(from->si_syscall, &to->si_syscall); -				put_user_ex(from->si_arch, &to->si_arch); -				break; -			case SIL_CHLD: -				if (!x32_ABI) { -					put_user_ex(from->si_utime, &to->si_utime); -					put_user_ex(from->si_stime, &to->si_stime); -				} else { -					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); -					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); -				} -				put_user_ex(from->si_status, &to->si_status); -				/* FALL THROUGH */ -			case SIL_KILL: -				put_user_ex(from->si_uid, &to->si_uid); -				break; -			case SIL_POLL: -				put_user_ex(from->si_fd, &to->si_fd); -				break; -			case SIL_TIMER: -				put_user_ex(from->si_overrun, &to->si_overrun); -				put_user_ex(ptr_to_compat(from->si_ptr), -					    &to->si_ptr); -				
break; -			case SIL_RT: -				put_user_ex(from->si_uid, &to->si_uid); -				put_user_ex(from->si_int, &to->si_int); -				break; -			} -		} -	} put_user_catch(err); - -	return err; -} - -/* from syscall's path, where we know the ABI */ -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ -	return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ -	int err = 0; -	u32 ptr32; - -	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) -		return -EFAULT; - -	get_user_try { -		get_user_ex(to->si_signo, &from->si_signo); -		get_user_ex(to->si_errno, &from->si_errno); -		get_user_ex(to->si_code, &from->si_code); - -		get_user_ex(to->si_pid, &from->si_pid); -		get_user_ex(to->si_uid, &from->si_uid); -		get_user_ex(ptr32, &from->si_ptr); -		to->si_ptr = compat_ptr(ptr32); -	} get_user_catch(err); - -	return err; -} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ed556d50d7ed..6f27facbaa9b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -75,7 +75,6 @@  #include <asm/uv/uv.h>  #include <linux/mc146818rtc.h>  #include <asm/i8259.h> -#include <asm/realmode.h>  #include <asm/misc.h>  #include <asm/qspinlock.h> @@ -934,7 +933,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,  	 * the targeted processor.  	 */ -	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { +	if (x86_platform.legacy.warm_reset) {  		pr_debug("Setting warm reset code and vector.\n"); @@ -1006,7 +1005,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,  	/* mark "stuck" area as not stuck */  	*trampoline_status = 0; -	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { +	if (x86_platform.legacy.warm_reset) {  		/*  		 * Cleanup possible dangling ends...  		 
*/ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,  		return -1;  	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));  	pte_unmap(pte); + +	/* +	 * PTI poisons low addresses in the kernel page tables in the +	 * name of making them unusable for userspace.  To execute +	 * code at such a low address, the poison must be cleared. +	 * +	 * Note: 'pgd' actually gets set in p4d_alloc() _or_ +	 * pud_alloc() depending on 4/5-level paging. +	 */ +	pgd->pgd &= ~_PAGE_NX; +  	return 0;  } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 749d189f8cd4..774ebafa97c4 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -69,9 +69,12 @@ static struct irqaction irq0  = {  static void __init setup_default_timer_irq(void)  { -	if (!nr_legacy_irqs()) -		return; -	setup_irq(0, &irq0); +	/* +	 * Unconditionally register the legacy timer; even without legacy +	 * PIC/PIT we need this for the HPET0 in legacy replacement mode. +	 */ +	if (setup_irq(0, &irq0)) +		pr_info("Failed to register legacy timer interrupt\n");  }  /* Default timer init function */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8ea117f8142e..fb4302738410 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -25,6 +25,7 @@  #include <asm/geode.h>  #include <asm/apic.h>  #include <asm/intel-family.h> +#include <asm/i8259.h>  unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */  EXPORT_SYMBOL(cpu_khz); @@ -363,6 +364,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)  	unsigned long tscmin, tscmax;  	int pitcnt; +	if (!has_legacy_pic()) { +		/* +		 * Relies on tsc_early_delay_calibrate() to have given us semi +		 * usable udelay(), wait for the same 50ms we would have with +		 * the PIT loop below. 
+		 */ +		udelay(10 * USEC_PER_MSEC); +		udelay(10 * USEC_PER_MSEC); +		udelay(10 * USEC_PER_MSEC); +		udelay(10 * USEC_PER_MSEC); +		udelay(10 * USEC_PER_MSEC); +		return ULONG_MAX; +	} +  	/* Set the Gate high, disable speaker */  	outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -487,6 +502,9 @@ static unsigned long quick_pit_calibrate(void)  	u64 tsc, delta;  	unsigned long d1, d2; +	if (!has_legacy_pic()) +		return 0; +  	/* Set the Gate high, disable speaker */  	outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -602,7 +620,6 @@ unsigned long native_calibrate_tsc(void)  		case INTEL_FAM6_KABYLAKE_DESKTOP:  			crystal_khz = 24000;	/* 24.0 MHz */  			break; -		case INTEL_FAM6_SKYLAKE_X:  		case INTEL_FAM6_ATOM_DENVERTON:  			crystal_khz = 25000;	/* 25.0 MHz */  			break; @@ -612,6 +629,8 @@ unsigned long native_calibrate_tsc(void)  		}  	} +	if (crystal_khz == 0) +		return 0;  	/*  	 * TSC frequency determined by CPUID is a "hardware reported"  	 * frequency and is the most accurate one so far we have. This @@ -987,8 +1006,6 @@ static void __init detect_art(void)  /* clocksource code */ -static struct clocksource clocksource_tsc; -  static void tsc_resume(struct clocksource *cs)  {  	tsc_verify_tsc_adjust(true); @@ -1039,12 +1056,31 @@ static void tsc_cs_tick_stable(struct clocksource *cs)  /*   * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()   */ +static struct clocksource clocksource_tsc_early = { +	.name                   = "tsc-early", +	.rating                 = 299, +	.read                   = read_tsc, +	.mask                   = CLOCKSOURCE_MASK(64), +	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS | +				  CLOCK_SOURCE_MUST_VERIFY, +	.archdata               = { .vclock_mode = VCLOCK_TSC }, +	.resume			= tsc_resume, +	.mark_unstable		= tsc_cs_mark_unstable, +	.tick_stable		= tsc_cs_tick_stable, +}; + +/* + * Must mark VALID_FOR_HRES early such that when we unregister tsc_early + * this one will immediately take over. 
We will only register if TSC has + * been found good. + */  static struct clocksource clocksource_tsc = {  	.name                   = "tsc",  	.rating                 = 300,  	.read                   = read_tsc,  	.mask                   = CLOCKSOURCE_MASK(64),  	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS | +				  CLOCK_SOURCE_VALID_FOR_HRES |  				  CLOCK_SOURCE_MUST_VERIFY,  	.archdata               = { .vclock_mode = VCLOCK_TSC },  	.resume			= tsc_resume, @@ -1168,8 +1204,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)  	int cpu;  	/* Don't bother refining TSC on unstable systems */ -	if (check_tsc_unstable()) -		goto out; +	if (tsc_unstable) +		return;  	/*  	 * Since the work is started early in boot, we may be @@ -1221,9 +1257,13 @@ static void tsc_refine_calibration_work(struct work_struct *work)  		set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);  out: +	if (tsc_unstable) +		return; +  	if (boot_cpu_has(X86_FEATURE_ART))  		art_related_clocksource = &clocksource_tsc;  	clocksource_register_khz(&clocksource_tsc, tsc_khz); +	clocksource_unregister(&clocksource_tsc_early);  } @@ -1232,13 +1272,11 @@ static int __init init_tsc_clocksource(void)  	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)  		return 0; +	if (check_tsc_unstable()) +		return 0; +  	if (tsc_clocksource_reliable)  		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -	/* lower the rating if we already know its unstable: */ -	if (check_tsc_unstable()) { -		clocksource_tsc.rating = 0; -		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; -	}  	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))  		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1251,6 +1289,7 @@ static int __init init_tsc_clocksource(void)  		if (boot_cpu_has(X86_FEATURE_ART))  			art_related_clocksource = &clocksource_tsc;  		clocksource_register_khz(&clocksource_tsc, tsc_khz); +		clocksource_unregister(&clocksource_tsc_early);  		return 0;  	} @@ -1315,6 +1354,12 @@ void 
__init tsc_init(void)  		(unsigned long)cpu_khz / 1000,  		(unsigned long)cpu_khz % 1000); +	if (cpu_khz != tsc_khz) { +		pr_info("Detected %lu.%03lu MHz TSC", +			(unsigned long)tsc_khz / 1000, +			(unsigned long)tsc_khz % 1000); +	} +  	/* Sanitize TSC ADJUST before cyc2ns gets initialized */  	tsc_store_and_check_tsc_adjust(true); @@ -1349,9 +1394,12 @@ void __init tsc_init(void)  	check_system_tsc_reliable(); -	if (unsynchronized_tsc()) +	if (unsynchronized_tsc()) {  		mark_tsc_unstable("TSCs unsynchronized"); +		return; +	} +	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);  	detect_art();  } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..1f9188f5357c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip)  }  #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the undwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entrie, as the placement of the return code in + * the stack will be identical. 
+ */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ +	struct ftrace_ops *ops; +	unsigned long caller; + +	ops = ftrace_ops_trampoline(ip); +	if (!ops) +		return NULL; + +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) +		caller = (unsigned long)ftrace_regs_call; +	else +		caller = (unsigned long)ftrace_call; + +	/* Prevent unlikely recursion */ +	if (ip == caller) +		return NULL; + +	return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ +	return NULL; +} +#endif +  static struct orc_entry *orc_find(unsigned long ip)  { +	static struct orc_entry *orc; +  	if (!orc_init)  		return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip)  				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);  	/* Module lookup: */ -	return orc_module_find(ip); +	orc = orc_module_find(ip); +	if (orc) +		return orc; + +	return orc_ftrace_find(ip);  }  static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a3755d293a48..85c7ef23d99f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  	return 0;  } -static int push_ret_address(struct pt_regs *regs, unsigned long ip) +static int emulate_push_stack(struct pt_regs *regs, unsigned long val)  {  	unsigned long new_sp = regs->sp - sizeof_long(); -	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) +	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))  		return -EFAULT;  	regs->sp = new_sp; @@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs  		regs->ip += correction;  	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {  		regs->sp += sizeof_long(); /* Pop incorrect return address */ -		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) +		if (emulate_push_stack(regs, utask->vaddr + 
auprobe->defparam.ilen))  			return -ERESTART;  	}  	/* popf; tell the caller to not touch TF */ @@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  		 *  		 * But there is corner case, see the comment in ->post_xol().  		 */ -		if (push_ret_address(regs, new_ip)) +		if (emulate_push_stack(regs, new_ip))  			return false;  	} else if (!check_jmp_cond(auprobe, regs)) {  		offs = 0; @@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  	return true;  } +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; + +	if (emulate_push_stack(regs, *src_ptr)) +		return false; +	regs->ip += auprobe->push.ilen; +	return true; +} +  static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	BUG_ON(!branch_is_call(auprobe)); @@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {  	.post_xol = branch_post_xol_op,  }; +static const struct uprobe_xol_ops push_xol_ops = { +	.emulate  = push_emulate_op, +}; +  /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  { @@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  	return 0;  } +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ +	u8 opc1 = OPCODE1(insn), reg_offset = 0; + +	if (opc1 < 0x50 || opc1 > 0x57) +		return -ENOSYS; + +	if (insn->length > 2) +		return -ENOSYS; +	if (insn->length == 2) { +		/* only support rex_prefix 0x41 (x64 only) */ +#ifdef CONFIG_X86_64 +		if (insn->rex_prefix.nbytes != 1 || +		    insn->rex_prefix.bytes[0] != 0x41) +			return -ENOSYS; + +		switch (opc1) { +		case 0x50: +			reg_offset = offsetof(struct pt_regs, r8); +			break; +		case 
0x51: +			reg_offset = offsetof(struct pt_regs, r9); +			break; +		case 0x52: +			reg_offset = offsetof(struct pt_regs, r10); +			break; +		case 0x53: +			reg_offset = offsetof(struct pt_regs, r11); +			break; +		case 0x54: +			reg_offset = offsetof(struct pt_regs, r12); +			break; +		case 0x55: +			reg_offset = offsetof(struct pt_regs, r13); +			break; +		case 0x56: +			reg_offset = offsetof(struct pt_regs, r14); +			break; +		case 0x57: +			reg_offset = offsetof(struct pt_regs, r15); +			break; +		} +#else +		return -ENOSYS; +#endif +	} else { +		switch (opc1) { +		case 0x50: +			reg_offset = offsetof(struct pt_regs, ax); +			break; +		case 0x51: +			reg_offset = offsetof(struct pt_regs, cx); +			break; +		case 0x52: +			reg_offset = offsetof(struct pt_regs, dx); +			break; +		case 0x53: +			reg_offset = offsetof(struct pt_regs, bx); +			break; +		case 0x54: +			reg_offset = offsetof(struct pt_regs, sp); +			break; +		case 0x55: +			reg_offset = offsetof(struct pt_regs, bp); +			break; +		case 0x56: +			reg_offset = offsetof(struct pt_regs, si); +			break; +		case 0x57: +			reg_offset = offsetof(struct pt_regs, di); +			break; +		} +	} + +	auprobe->push.reg_offset = reg_offset; +	auprobe->push.ilen = insn->length; +	auprobe->ops = &push_xol_ops; +	return 0; +} +  /**   * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.   * @mm: the probed address space. @@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,  	if (ret != -ENOSYS)  		return ret; +	ret = push_setup_xol_ops(auprobe, &insn); +	if (ret != -ENOSYS) +		return ret; +  	/*  	 * Figure out which fixups default_post_xol_op() will need to perform,  	 * and annotate defparam->fixups accordingly. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1e413a9326aa..9b138a06c1a4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -124,6 +124,12 @@ SECTIONS  		ASSERT(. 
- _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");  #endif +#ifdef CONFIG_RETPOLINE +		__indirect_thunk_start = .; +		*(.text.__x86.indirect_thunk) +		__indirect_thunk_end = .; +#endif +  		/* End of text section */  		_etext = .;  	} :text = 0x9090 | 

