summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/Makefile8
-rw-r--r--arch/x86/kernel/cpu/amd.c106
-rw-r--r--arch/x86/kernel/cpu/bugs.c212
-rw-r--r--arch/x86/kernel/cpu/centaur.c37
-rw-r--r--arch/x86/kernel/cpu/common.c325
-rw-r--r--arch/x86/kernel/cpu/cpu.h22
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c97
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c145
-rw-r--r--arch/x86/kernel/cpu/hygon.c21
-rw-r--r--arch/x86/kernel/cpu/intel.c129
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c8
-rw-r--r--arch/x86/kernel/cpu/mce/core.c165
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c28
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h8
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c4
-rw-r--r--arch/x86/kernel/cpu/mce/therm_throt.c267
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c36
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh15
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c21
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c66
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c15
-rw-r--r--arch/x86/kernel/cpu/rdrand.c22
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c159
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c2
-rw-r--r--arch/x86/kernel/cpu/tsx.c144
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c96
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c37
38 files changed, 1545 insertions, 684 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..7dc4ad68eb41 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -29,8 +29,9 @@ obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o
+obj-y += intel.o intel_pconfig.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
@@ -53,11 +54,12 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8d4e50428b68..ac83a0fef628 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/random.h>
+#include <linux/topology.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -318,13 +319,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
-
-static void amd_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -614,9 +608,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
return;
clear_all:
- clear_cpu_cap(c, X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_SME);
clear_sev:
- clear_cpu_cap(c, X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
}
}
@@ -716,7 +710,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
}
}
- amd_get_topology_early(c);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -804,6 +799,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c)
msr_set_bit(MSR_AMD64_DE_CFG, 31);
}
+static bool rdrand_force;
+
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
+ }
+
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
@@ -818,12 +871,23 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
+
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
}
static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
+#endif
+
/*
* Fix erratum 1076: CPB feature bit not being set in CPUID.
* Always set it, except when running under a hypervisor.
@@ -860,6 +924,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
case 0x17: init_amd_zn(c); break;
}
@@ -879,12 +944,8 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_XMM2)) {
- unsigned long long val;
- int ret;
-
/*
- * A serializing LFENCE has less overhead than MFENCE, so
- * use it for execution serialization. On families which
+ * Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
@@ -892,19 +953,8 @@ static void init_amd(struct cpuinfo_x86 *c)
msr_set_bit(MSR_F10H_DECFG,
MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
- /*
- * Verify that the MSR write was successful (could be running
- * under a hypervisor) and only then assume that LFENCE is
- * serializing.
- */
- ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
- if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
- /* A serializing LFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- } else {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
- }
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
/*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c6fa3ef10b4e..ed54b3b21c39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
+static void __init mds_print_mitigation(void);
+static void __init taa_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -105,6 +107,13 @@ void __init check_bugs(void)
ssb_select_mitigation();
l1tf_select_mitigation();
mds_select_mitigation();
+ taa_select_mitigation();
+
+ /*
+ * As MDS and TAA mitigations are inter-related, print MDS
+ * mitigation until after TAA mitigation selection is done.
+ */
+ mds_print_mitigation();
arch_smt_update();
@@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void)
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
}
+}
+
+static void __init mds_print_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
pr_info("%s\n", mds_strings[mds_mitigation]);
}
@@ -269,6 +284,120 @@ static int __init mds_cmdline(char *str)
early_param("mds", mds_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ goto out;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /*
+ * TAA mitigation via VERW is turned off if both
+ * tsx_async_abort=off and mds=off are specified.
+ */
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ mds_mitigation == MDS_MITIGATION_OFF)
+ goto out;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+
+ /*
+ * Update MDS mitigation, if necessary, as the mds_user_clear is
+ * now enabled for TAA mitigation.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+out:
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -786,13 +915,10 @@ static void update_mds_branch_idle(void)
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
-void arch_smt_update(void)
+void cpu_bugs_smt_update(void)
{
- /* Enhanced IBRS implies STIBP. No update required. */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
- return;
-
mutex_lock(&spec_ctrl_mutex);
switch (spectre_v2_user) {
@@ -819,6 +945,17 @@ void arch_smt_update(void)
break;
}
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1149,6 +1286,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -1184,15 +1324,15 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_SANDYBRIDGE:
case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL_CORE:
- case INTEL_FAM6_HASWELL_ULT:
- case INTEL_FAM6_HASWELL_GT3E:
- case INTEL_FAM6_BROADWELL_CORE:
- case INTEL_FAM6_BROADWELL_GT3E:
- case INTEL_FAM6_SKYLAKE_MOBILE:
- case INTEL_FAM6_SKYLAKE_DESKTOP:
- case INTEL_FAM6_KABYLAKE_MOBILE:
- case INTEL_FAM6_KABYLAKE_DESKTOP:
+ case INTEL_FAM6_HASWELL:
+ case INTEL_FAM6_HASWELL_L:
+ case INTEL_FAM6_HASWELL_G:
+ case INTEL_FAM6_BROADWELL:
+ case INTEL_FAM6_BROADWELL_G:
+ case INTEL_FAM6_SKYLAKE_L:
+ case INTEL_FAM6_SKYLAKE:
+ case INTEL_FAM6_KABYLAKE_L:
+ case INTEL_FAM6_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -1304,11 +1444,24 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
#endif
static ssize_t mds_show_state(char *buf)
@@ -1328,6 +1481,21 @@ static ssize_t mds_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1398,6 +1566,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_MDS:
return mds_show_state(buf);
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
default:
break;
}
@@ -1434,4 +1608,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
#endif
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 14433ff5b828..426792565d86 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,13 +18,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -71,8 +64,6 @@ static void init_c3(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size * 2;
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
-
- cpu_detect_cache_sizes(c);
}
enum {
@@ -119,31 +110,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
}
}
-static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
@@ -250,8 +216,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- centaur_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f125bf7ecb6f..52c9bfbbdb2a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -14,6 +14,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
@@ -24,6 +25,7 @@
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -48,15 +50,12 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
-#endif
#include "cpu.h"
@@ -165,22 +164,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-static int __init x86_mpx_setup(char *s)
-{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
-
- /* do not emit a message if the feature is not present */
- if (!boot_cpu_has(X86_FEATURE_MPX))
- return 1;
-
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
- return 1;
-}
-__setup("nompx", x86_mpx_setup);
-
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
{
@@ -307,8 +290,6 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
static __init int setup_disable_smep(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_SMEP);
- /* Check for things that depend on SMEP being enabled: */
- check_mpx_erratum(&boot_cpu_data);
return 1;
}
__setup("nosmep", setup_disable_smep);
@@ -565,8 +546,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
-__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
void load_percpu_segment(int cpu)
{
@@ -1016,13 +998,15 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
-#define NO_SPECULATION BIT(0)
-#define NO_MELTDOWN BIT(1)
-#define NO_SSB BIT(2)
-#define NO_L1TF BIT(3)
-#define NO_MDS BIT(4)
-#define MSBDS_ONLY BIT(5)
-#define NO_SWAPGS BIT(6)
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1043,26 +1027,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
-
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1072,15 +1057,21 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
+ VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
+
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
{}
};
@@ -1091,18 +1082,31 @@ static bool __init cpu_matches(unsigned long which)
return m && !!(m->driver_data & which);
}
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+u64 x86_read_arch_cap_msr(void)
{
u64 ia32_cap = 0;
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+ return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ if (!cpu_matches(NO_SPECTRE_V2))
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
@@ -1120,6 +1124,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (!cpu_matches(NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present or running as guest that don't get TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
if (cpu_matches(NO_MELTDOWN))
return;
@@ -1420,6 +1439,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
@@ -1553,6 +1575,8 @@ void __init identify_boot_cpu(void)
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
+
+ tsx_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1748,7 +1772,7 @@ static void wait_for_master_cpu(int cpu)
}
#ifdef CONFIG_X86_64
-static void setup_getcpu(int cpu)
+static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
@@ -1768,7 +1792,50 @@ static void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+
+static inline void ucode_cpu_init(int cpu)
+{
+ if (cpu)
+ load_ucode_ap();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void setup_getcpu(int cpu) { }
+
+static inline void ucode_cpu_init(int cpu)
+{
+ show_ucode_info_early();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
#endif
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1776,21 +1843,15 @@ static void setup_getcpu(int cpu)
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
void cpu_init(void)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- struct task_struct *me;
- struct tss_struct *t;
- int i;
wait_for_master_cpu(cpu);
- if (cpu)
- load_ucode_ap();
-
- t = &per_cpu(cpu_tss_rw, cpu);
+ ucode_cpu_init(cpu);
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
@@ -1799,63 +1860,47 @@ void cpu_init(void)
#endif
setup_getcpu(cpu);
- me = current;
-
pr_debug("Initializing CPU#%d\n", cpu);
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
-
switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
load_current_idt();
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- x86_configure_nx();
- x2apic_setup();
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
- /*
- * set up and load the per-CPU TSS
- */
- if (!t->x86_tss.ist[0]) {
- t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
- t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
- t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
- t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ x2apic_setup();
}
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
mmgrab(&init_mm);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, me);
+ enter_lazy_tlb(&init_mm, cur);
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
+ /* Initialize the TSS. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
load_TR_desc();
+ /*
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
+ */
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1863,6 +1908,8 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
+ doublefault_init_cpu_tss();
+
fpu__init_cpu();
if (is_uv_system())
@@ -1871,63 +1918,6 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
-#else
-
-void cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
-
- wait_for_master_cpu(cpu);
-
- show_ucode_info_early();
-
- pr_info("Initializing CPU#%d\n", cpu);
-
- if (cpu_feature_enabled(X86_FEATURE_VME) ||
- boot_cpu_has(X86_FEATURE_TSC) ||
- boot_cpu_has(X86_FEATURE_DE))
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- load_current_idt();
- switch_to_new_gdt(cpu);
-
- /*
- * Set up and load the per-CPU TSS and LDT
- */
- mmgrab(&init_mm);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, curr);
-
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
- load_TR_desc();
- load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
-
- load_mm_ldt(&init_mm);
-
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-
- clear_all_debug_regs();
- dbg_restore_debug_regs();
-
- fpu__init_cpu();
-
- load_fixmap_gdt(cpu);
-}
-#endif
-
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
@@ -1957,3 +1947,14 @@ void microcode_check(void)
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+
+/*
+ * Invoked from core CPU hotplug code after hotplug operations
+ */
+void arch_smt_update(void)
+{
+ /* Handle the speculative execution misfeatures */
+ cpu_bugs_smt_update();
+ /* Check whether IPI broadcasting can be enabled */
+ apic_smt_update();
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c0e2407abdd6..37fdefd14f28 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
@@ -62,4 +78,10 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern u64 x86_read_arch_cap_msr(void);
+
+#ifdef CONFIG_IA32_FEAT_CTL
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
+#endif
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b5353244749b..3cbe24ca80ab 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -20,54 +20,55 @@ struct cpuid_dep {
* but it's difficult to tell that to the init reference checker.
*/
static const struct cpuid_dep cpuid_deps[] = {
- { X86_FEATURE_FXSR, X86_FEATURE_FPU },
- { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
- { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
- { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
- { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
- { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
- { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
- { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
- { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
- { X86_FEATURE_MMX, X86_FEATURE_FXSR },
- { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
- { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
- { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
- { X86_FEATURE_XMM, X86_FEATURE_FXSR },
- { X86_FEATURE_XMM2, X86_FEATURE_XMM },
- { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
- { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
- { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
- { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
- { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
- { X86_FEATURE_AES, X86_FEATURE_XMM2 },
- { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
- { X86_FEATURE_FMA, X86_FEATURE_AVX },
- { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
- { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
- { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
- { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
- { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
- { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
- { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..0268185bef94
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool tboot = tboot_enabled();
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ return;
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ wrmsrl(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ return;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 415621ddb8a2..4e28c1fc8749 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -330,12 +330,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
init_hygon_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_XMM2)) {
- unsigned long long val;
- int ret;
-
/*
- * A serializing LFENCE has less overhead than MFENCE, so
- * use it for execution serialization. On families which
+ * Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
@@ -343,19 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
msr_set_bit(MSR_F10H_DECFG,
MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
- /*
- * Verify that the MSR write was successful (could be running
- * under a hypervisor) and only then assume that LFENCE is
- * serializing.
- */
- ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
- if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
- /* A serializing LFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- } else {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
- }
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
/*
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8d6d92ebeb54..be82cd5841c3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -32,41 +32,6 @@
#endif
/*
- * Just in case our CPU detection goes bad, or you have a weird system,
- * allow a way to override the automatic disabling of MPX.
- */
-static int forcempx;
-
-static int __init forcempx_setup(char *__unused)
-{
- forcempx = 1;
-
- return 1;
-}
-__setup("intel-skd-046-workaround=disable", forcempx_setup);
-
-void check_mpx_erratum(struct cpuinfo_x86 *c)
-{
- if (forcempx)
- return;
- /*
- * Turn off the MPX feature on CPUs where SMEP is not
- * available or disabled.
- *
- * Works around Intel Erratum SKD046: "Branch Instructions
- * May Initialize MPX Bound Registers Incorrectly".
- *
- * This might falsely disable MPX on systems without
- * SMEP, like Atom processors without SMEP. But there
- * is no such hardware known at the moment.
- */
- if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
- }
-}
-
-/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
@@ -142,21 +107,21 @@ struct sku_microcode {
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
{ INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
+ { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
+ { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
{ INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
+ { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
+ { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
+ { INTEL_FAM6_HASWELL, 0x03, 0x23 },
{ INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
{ INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
{ INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
@@ -265,9 +230,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
if (c->x86 == 6) {
switch (c->x86_model) {
- case 0x27: /* Penwell */
- case 0x35: /* Cloverview */
- case 0x4a: /* Merrifield */
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
+ case INTEL_FAM6_ATOM_AIRMONT_NP:
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
@@ -329,7 +295,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
- check_mpx_erratum(c);
check_memory_type_self_snoop_errata(c);
/*
@@ -493,52 +458,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
- u32 msr_vpid_cap, msr_ept_cap;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
- clear_cpu_cap(c, X86_FEATURE_EPT_AD);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
- set_cpu_cap(c, X86_FEATURE_EPT);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- msr_ept_cap, msr_vpid_cap);
- if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
- set_cpu_cap(c, X86_FEATURE_EPT_AD);
- }
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
#define MSR_IA32_TME_ACTIVATE 0x982
/* Helpers to access TME_ACTIVATE MSR */
@@ -754,13 +673,17 @@ static void init_intel(struct cpuinfo_x86 *c)
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
init_intel_misc_features(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
}
#ifdef CONFIG_X86_32
@@ -813,7 +736,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -841,7 +764,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
@@ -853,8 +776,8 @@ static void intel_tlb_lookup(const unsigned char desc)
return;
/* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc && \
- intel_tlb_table[k].descriptor != 0; k++)
+ for (k = 0; intel_tlb_table[k].descriptor != desc &&
+ intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 6ea7fdc82f3c..b3a50d962851 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -78,6 +78,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_DE] = { "decode_unit", "Decode Unit" },
@@ -138,6 +139,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
@@ -266,10 +268,10 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
smca_set_misc_banks_map(bank, cpu);
/* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid)
+ if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -583,7 +585,7 @@ bool amd_filter_mce(struct mce *m)
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
int i, num_msrs;
u64 hwcr;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 743370ee4983..2c4f949611e4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -53,8 +53,6 @@
#include "internal.h"
-static DEFINE_MUTEX(mce_log_mutex);
-
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
@@ -156,19 +154,10 @@ void mce_log(struct mce *m)
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
}
-
-void mce_inject_log(struct mce *m)
-{
- mutex_lock(&mce_log_mutex);
- mce_log(m);
- mutex_unlock(&mce_log_mutex);
-}
-EXPORT_SYMBOL_GPL(mce_inject_log);
-
-static struct notifier_block mce_srao_nb;
+EXPORT_SYMBOL_GPL(mce_log);
/*
- * We run the default notifier if we have only the SRAO, the first and the
+ * We run the default notifier if we have only the UC, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
@@ -488,8 +477,9 @@ int mce_usable_address(struct mce *m)
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
- /* Checks after this one are Intel-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ /* Checks after this one are Intel/Zhaoxin-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
if (!(m->status & MCI_STATUS_MISCV))
@@ -507,10 +497,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
{
- if (m->cpuvendor == X86_VENDOR_AMD ||
- m->cpuvendor == X86_VENDOR_HYGON) {
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
- } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -527,9 +520,10 @@ bool mce_is_memory_error(struct mce *m)
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
- }
- return false;
+ default:
+ return false;
+ }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
@@ -589,26 +583,29 @@ static struct notifier_block first_nb = {
.priority = MCE_PRIO_FIRST,
};
-static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
- if (!mce)
+ if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
- if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
- pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
- }
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = mce->addr >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
return NOTIFY_OK;
}
-static struct notifier_block mce_srao_nb = {
- .notifier_call = srao_decode_notifier,
- .priority = MCE_PRIO_SRAO,
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -758,26 +755,22 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
log_it:
error_seen = true;
- mce_read_aux(&m, i);
+ if (flags & MCP_DONTLOG)
+ goto clear_it;
+ mce_read_aux(&m, i);
m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
- mce_log(&m);
- else if (mce_usable_address(&m)) {
- /*
- * Although we skipped logging this, we still want
- * to take action. Add to the pool so the registered
- * notifiers will see it.
- */
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
+ if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ goto clear_it;
+
+ mce_log(&m);
+
+clear_it:
/*
* Clear state for this bank.
*/
@@ -802,7 +795,7 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
- char *tmp;
+ char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
@@ -814,8 +807,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
+ m->bank = i;
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
- m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -1127,6 +1120,12 @@ static bool __mc_check_crashing_cpu(int cpu)
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
@@ -1221,8 +1220,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
int cpu = smp_processor_id();
- char *msg = "Unknown";
struct mce m, *final;
+ char *msg = NULL;
int worst = 0;
/*
@@ -1277,9 +1276,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Check if this MCE is signaled to only this logical processor,
- * on Intel only.
+ * on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL)
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@@ -1353,7 +1353,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
ist_end_non_atomic();
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, NULL);
+ mce_panic("Failed kernel mode recovery", &m, msg);
}
out_ist:
@@ -1697,6 +1697,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
}
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+ }
+
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
@@ -1760,6 +1772,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
}
}
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+ mce_adjust_timer = cmci_intel_adjust_timer;
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1781,6 +1822,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_centaur_feature_init(c);
break;
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
default:
break;
}
@@ -1792,6 +1837,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
default:
break;
}
@@ -1979,7 +2029,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
- mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2014,15 +2064,16 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
- * are socket-wide.
- * Disabling them for just a single offlined CPU is bad, since it will
- * inhibit reporting for all shared resources on the socket like the
- * last level cache (LLC), the integrated memory controller (iMC), etc.
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
return;
mce_disable_error_reporting();
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 1f30117b24ba..3413b41b8d55 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -494,7 +494,7 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
+ mce_log(&i_mce);
return;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e43eb6732630..5627b1091b85 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -85,8 +85,10 @@ static int cmci_supported(int *banks)
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
+
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -113,15 +115,16 @@ static bool lmce_supported(void)
/*
* BIOS should indicate support for LMCE by setting bit 20 in
- * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
- * generate a #GP fault.
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
- if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
- (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
- return true;
+ rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
- return false;
+ return tmp & FEAT_CTL_LMCE_ENABLED;
}
bool mce_intel_cmci_poll(void)
@@ -423,7 +426,7 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-static void intel_init_cmci(void)
+void intel_init_cmci(void)
{
int banks;
@@ -442,7 +445,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
-static void intel_init_lmce(void)
+void intel_init_lmce(void)
{
u64 val;
@@ -455,7 +458,7 @@ static void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
-static void intel_clear_lmce(void)
+void intel_clear_lmce(void)
{
u64 val;
@@ -479,9 +482,10 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case INTEL_FAM6_IVYBRIDGE_X:
case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_ICELAKE_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 43031db429d2..b785c0d0b590 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
#endif
void mce_timer_kick(unsigned long interval);
@@ -78,8 +84,6 @@ static inline int apei_clear_mce(u64 record_id)
}
#endif
-void mce_inject_log(struct mce *m);
-
/*
* We consider records to be equivalent if bank+status+addr+misc all match.
* This is only used when the system is going down because of a fatal error
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 210f1f5db5f7..87bcdc6dc2f0 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -107,11 +107,11 @@ static struct severity {
*/
MCESEV(
AO, "Action optional: memory scrubbing error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
),
/* ignore OVER for UCNA */
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index 6e2becf547c5..58b4ee3cda77 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
-/*
- * Current thermal event state:
+/**
+ * struct _thermal_state - Represent the current thermal event state
+ * @next_check: Stores the next timestamp, when it is allowed
+ * to log the next warning message.
+ * @last_interrupt_time: Stores the timestamp for the last threshold
+ * high event.
+ * @therm_work: Delayed workqueue structure
+ * @count: Stores the current running count for thermal
+ * or power threshold interrupts.
+ * @last_count: Stores the previous running count for thermal
+ * or power threshold interrupts.
+ * @max_time_ms: This shows the maximum amount of time CPU was
+ * in throttled state for a single thermal
+ * threshold high to low state.
+ * @total_time_ms: This is a cumulative time during which CPU was
+ * in the throttled state.
+ * @rate_control_active: Set when a throttling message is logged.
+ * This is used for the purpose of rate-control.
+ * @new_event: Stores the last high/low status of the
+ * THERM_STATUS_PROCHOT or
+ * THERM_STATUS_POWER_LIMIT.
+ * @level: Stores whether this _thermal_state instance is
+ * for a CORE level or for PACKAGE level.
+ * @sample_index: Index for storing the next sample in the buffer
+ * temp_samples[].
+ * @sample_count: Total number of samples collected in the buffer
+ * temp_samples[].
+ * @average: The last moving average of temperature samples
+ * @baseline_temp: Temperature at which thermal threshold high
+ * interrupt was generated.
+ * @temp_samples: Storage for temperature samples to calculate
+ * moving average.
+ *
+ * This structure is used to represent data related to thermal state for a CPU.
+ * There is a separate storage for core and package level for each CPU.
*/
struct _thermal_state {
- bool new_event;
- int event;
u64 next_check;
+ u64 last_interrupt_time;
+ struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
+ unsigned long max_time_ms;
+ unsigned long total_time_ms;
+ bool rate_control_active;
+ bool new_event;
+ u8 level;
+ u8 sample_index;
+ u8 sample_count;
+ u8 average;
+ u8 baseline_temp;
+ u8 temp_samples[3];
};
struct thermal_state {
@@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(core_throttle, max_time_ms);
+define_therm_throt_device_one_ro(core_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, max_time_ms);
+define_therm_throt_device_one_ro(package_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(core_throttle, total_time_ms);
+define_therm_throt_device_one_ro(core_throttle_total_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, total_time_ms);
+define_therm_throt_device_one_ro(package_throttle_total_time_ms);
+
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_max_time_ms.attr,
+ &dev_attr_core_throttle_total_time_ms.attr,
NULL
};
@@ -135,6 +192,112 @@ static const struct attribute_group thermal_attr_group = {
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
+#define THERM_THROT_POLL_INTERVAL HZ
+#define THERM_STATUS_PROCHOT_LOG BIT(1)
+
+#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
+#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+
+static void clear_therm_status_log(int level)
+{
+ int msr;
+ u64 mask, msr_val;
+
+ if (level == CORE_LEVEL) {
+ msr = MSR_IA32_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_CORE_MASK;
+ } else {
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_PKG_MASK;
+ }
+
+ rdmsrl(msr, msr_val);
+ msr_val &= mask;
+ wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
+}
+
+static void get_therm_status(int level, bool *proc_hot, u8 *temp)
+{
+ int msr;
+ u64 msr_val;
+
+ if (level == CORE_LEVEL)
+ msr = MSR_IA32_THERM_STATUS;
+ else
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+
+ rdmsrl(msr, msr_val);
+ if (msr_val & THERM_STATUS_PROCHOT_LOG)
+ *proc_hot = true;
+ else
+ *proc_hot = false;
+
+ *temp = (msr_val >> 16) & 0x7F;
+}
+
+static void __maybe_unused throttle_active_work(struct work_struct *work)
+{
+ struct _thermal_state *state = container_of(to_delayed_work(work),
+ struct _thermal_state, therm_work);
+ unsigned int i, avg, this_cpu = smp_processor_id();
+ u64 now = get_jiffies_64();
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /* temperature value is offset from the max so lesser means hotter */
+ if (!hot && temp > state->baseline_temp) {
+ if (state->rate_control_active)
+ pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+
+ state->rate_control_active = false;
+ return;
+ }
+
+ if (time_before64(now, state->next_check) &&
+ state->rate_control_active)
+ goto re_arm;
+
+ state->next_check = now + CHECK_INTERVAL;
+
+ if (state->count != state->last_count) {
+ /* There was one new thermal interrupt */
+ state->last_count = state->count;
+ state->average = 0;
+ state->sample_count = 0;
+ state->sample_index = 0;
+ }
+
+ state->temp_samples[state->sample_index] = temp;
+ state->sample_count++;
+ state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
+ if (state->sample_count < ARRAY_SIZE(state->temp_samples))
+ goto re_arm;
+
+ avg = 0;
+ for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
+ avg += state->temp_samples[i];
+
+ avg /= ARRAY_SIZE(state->temp_samples);
+
+ if (state->average > avg) {
+ pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ state->rate_control_active = true;
+ }
+
+ state->average = avg;
+
+re_arm:
+ clear_therm_status_log(state->level);
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+}
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -178,27 +341,33 @@ static void therm_throt_process(bool new_event, int event, int level)
if (new_event)
state->count++;
- if (time_before64(now, state->next_check) &&
- state->count != state->last_count)
+ if (event != THERMAL_THROTTLING_EVENT)
return;
- state->next_check = now + CHECK_INTERVAL;
- state->last_count = state->count;
+ if (new_event && !state->last_interrupt_time) {
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /*
+ * Ignore short temperature spike as the system is not close
+ * to PROCHOT. 10C offset is large enough to ignore. It is
+ * already dropped from the high threshold temperature.
+ */
+ if (temp > 10)
+ return;
- /* if we just entered the thermal event */
- if (new_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- return;
- }
- if (old_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
- return;
+ state->baseline_temp = temp;
+ state->last_interrupt_time = now;
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+ } else if (old_event && state->last_interrupt_time) {
+ unsigned long throttle_time;
+
+ throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
+ if (throttle_time > state->max_time_ms)
+ state->max_time_ms = throttle_time;
+ state->total_time_ms += throttle_time;
+ state->last_interrupt_time = 0;
}
}
@@ -244,20 +413,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
if (err)
return err;
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
+
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_max_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_total_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
}
+ return 0;
+
+del_group:
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+
return err;
}
@@ -269,15 +465,34 @@ static void thermal_throttle_remove_dev(struct device *dev)
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ u32 l;
+
+ state->package_throttle.level = PACKAGE_LEVEL;
+ state->core_throttle.level = CORE_LEVEL;
+
+ INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
+ INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
+
+ /* Unmask the thermal vector after the above workqueues are initialized. */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ cancel_delayed_work(&state->package_throttle.therm_work);
+ cancel_delayed_work(&state->core_throttle.therm_work);
+
+ state->package_throttle.rate_control_active = false;
+ state->core_throttle.rate_control_active = false;
+
thermal_throttle_remove_dev(dev);
return 0;
}
@@ -512,10 +727,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a0e52bd00ecc..3f6b137ef4e6 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
mc = (struct microcode_amd *)amd_ucode_patch;
@@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
BUG_ON(raw_smp_processor_id() != cpu);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index cb0fdcaf1415..7019d4b2df0c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache);
*/
static DEFINE_MUTEX(microcode_mutex);
-/*
- * Serialize late loading so that CPUs get updated one-by-one.
- */
-static DEFINE_RAW_SPINLOCK(update_lock);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
struct cpu_info_ctx {
@@ -566,11 +561,18 @@ static int __reload_late(void *info)
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
return -1;
- raw_spin_lock(&update_lock);
- apply_microcode_local(&err);
- raw_spin_unlock(&update_lock);
+ /*
+ * On an SMT system, it suffices to load the microcode on one sibling of
+ * the core because the microcode engine is shared between the threads.
+ * Synchronization still needs to take place so that no concurrent
+ * loading attempts happen on multiple threads of an SMT core. See
+ * below.
+ */
+ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
+ apply_microcode_local(&err);
+ else
+ goto wait_for_siblings;
- /* siblings return UCODE_OK because their engine got updated already */
if (err > UCODE_NFOUND) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
ret = -1;
@@ -578,14 +580,18 @@ static int __reload_late(void *info)
ret = 1;
}
+wait_for_siblings:
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
+ panic("Timeout during microcode update!\n");
+
/*
- * Increase the wait timeout to a safe value here since we're
- * serializing the microcode update and that could take a while on a
- * large number of CPUs. And that is fine as the *actual* timeout will
- * be determined by the last CPU finished updating and thus cut short.
+ * At least one thread has completed update on each core.
+ * For others, simply call the update to make sure the
+ * per-cpu cpuinfo can be updated with right microcode
+ * revision.
*/
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
- panic("Timeout during microcode update!\n");
+ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
+ apply_microcode_local(&err);
return ret;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ce799cfe9434..6a99535d7f37 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
struct microcode_intel *mc;
enum ucode_state ret;
static int prev_rev;
@@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
return UCODE_ERROR;
}
- if (rev != prev_rev) {
+ if (bsp && rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
@@ -852,7 +853,7 @@ out:
c->microcode = rev;
/* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (c->cpu_index == boot_cpu_data.cpu_index)
+ if (bsp)
boot_cpu_data.microcode = rev;
return ret;
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index aed45b8895d5..1db560ed2ca3 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,8 +6,7 @@
set -e
-IN=$1
-OUT=$2
+OUT=$1
dump_array()
{
@@ -15,6 +14,7 @@ dump_array()
SIZE=$2
PFX=$3
POSTFIX=$4
+ IN=$5
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
@@ -57,11 +57,18 @@ trap 'rm "$OUT"' EXIT
echo "#endif"
echo ""
- dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
echo ""
- dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 062f77279ce3..caa032ce3fe3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -29,6 +29,7 @@
#include <asm/timer.h>
#include <asm/reboot.h>
#include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -215,6 +216,10 @@ static void __init ms_hyperv_init_platform(void)
int hv_host_info_ecx;
int hv_host_info_edx;
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
/*
* Extract the features and hints
*/
@@ -285,7 +290,12 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.shutdown = hv_machine_shutdown;
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
- mark_tsc_unstable("running on Hyper-V");
+ if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) {
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ } else {
+ mark_tsc_unstable("running on Hyper-V");
+ }
/*
* Generation 2 instances don't support reading the NMI status from
@@ -338,6 +348,15 @@ static void __init ms_hyperv_init_platform(void)
x2apic_phys = 1;
# endif
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
+#endif
+}
+
+void hv_setup_sched_clock(void *sched_clock)
+{
+#ifdef CONFIG_PARAVIRT
+ pv_ops.time.sched_clock = sched_clock;
#endif
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index aa5c064a6a22..51b9190c628b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -15,7 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4d36dcc1cf87..a5c506f6da7f 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -101,9 +101,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
int length;
size_t linelen;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
memset(line, 0, LINE_SIZE);
len = min_t(size_t, len, LINE_SIZE - 1);
@@ -226,8 +223,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -236,24 +231,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -279,8 +268,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -289,8 +276,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -298,16 +283,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -373,28 +354,6 @@ static int mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
@@ -426,6 +385,29 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -436,7 +418,7 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 507039c20128..6a80f36b5d59 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -52,7 +52,7 @@
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index cb2e49810d68..4eec8889b0ff 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,6 +7,10 @@
#include "cpu.h"
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
+
/*
* Get CPU information for use by the procfs.
*/
@@ -102,6 +106,17 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
seq_puts(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 5c900f9527ff..c4be62058dd9 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup);
#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
- unsigned long tmp;
+ unsigned int changed = 0;
+ unsigned long tmp, prev;
int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
@@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
return;
}
}
+
+ /*
+ * Stupid sanity-check whether RDRAND does *actually* generate
+ * some at least random-looking data.
+ */
+ prev = tmp;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (rdrand_long(&tmp)) {
+ if (prev != tmp)
+ changed++;
+
+ prev = tmp;
+ }
+ }
+
+ if (WARN_ON_ONCE(!changed))
+ pr_emerg(
+"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
+
}
#endif
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 03eb90d00af0..89049b343c7a 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -618,7 +618,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
if (static_branch_unlikely(&rdt_mon_enable_key))
rmdir_mondata_subdir_allrdtgrp(r, d->id);
list_del(&d->list);
- if (is_mbm_enabled())
+ if (r->mon_capable && is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
/*
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index efbd54cc4e69..055c8613b531 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out;
+ }
md.priv = of->kn->priv;
resid = md.u.rid;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index e49b77283924..181c992f448c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -57,6 +57,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
}
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
/**
* struct mon_evt - Entry in the event list of a resource
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 397206f23d14..773124b0e18a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -514,7 +514,7 @@ void mbm_handle_overflow(struct work_struct *work)
mutex_lock(&rdtgroup_mutex);
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
@@ -543,7 +543,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
return;
cpu = cpumask_any(&dom->cpu_mask);
dom->mbm_work_cpu = cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index a46dee8e78db..064e9ef44cd6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
}
rdtgrp = rdtgroup_kn_lock_live(of->kn);
- rdt_last_cmd_clear();
if (!rdtgrp) {
ret = -ENOENT;
- rdt_last_cmd_puts("Directory was removed\n");
goto unlock;
}
@@ -534,11 +532,15 @@ static void move_myself(struct callback_head *head)
kfree(rdtgrp);
}
+ if (unlikely(current->flags & PF_EXITING))
+ goto out;
+
preempt_disable();
/* update PQR_ASSOC MSR to make resource group go into effect */
resctrl_sched_in();
preempt_enable();
+out:
kfree(callback);
}
@@ -727,6 +729,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+#ifdef CONFIG_PROC_CPU_RESCTRL
+
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1) res:
+ * mon:
+ *
+ * resctrl is not available.
+ *
+ * 2) res:/
+ * mon:
+ *
+ * Task is part of the root resctrl control group, and it is not associated
+ * to any monitor group.
+ *
+ * 3) res:/
+ * mon:mon0
+ *
+ * Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4) res:group0
+ * mon:
+ *
+ * Task is part of resctrl control group group0, and it is not associated
+ * to any monitor group.
+ *
+ * 5) res:group0
+ * mon:mon1
+ *
+ * Task is part of resctrl control group group0 and monitor group mon1.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ struct rdtgroup *rdtg;
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Return empty if resctrl has not been mounted. */
+ if (!static_branch_unlikely(&rdt_enable_key)) {
+ seq_puts(s, "res:\nmon:\n");
+ goto unlock;
+ }
+
+ list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+ struct rdtgroup *crg;
+
+ /*
+ * Task information is only relevant for shareable
+ * and exclusive groups.
+ */
+ if (rdtg->mode != RDT_MODE_SHAREABLE &&
+ rdtg->mode != RDT_MODE_EXCLUSIVE)
+ continue;
+
+ if (rdtg->closid != tsk->closid)
+ continue;
+
+ seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+ rdtg->kn->name);
+ seq_puts(s, "mon:");
+ list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+ mon.crdtgrp_list) {
+ if (tsk->rmid != crg->mon.rmid)
+ continue;
+ seq_printf(s, "%s", crg->kn->name);
+ break;
+ }
+ seq_putc(s, '\n');
+ goto unlock;
+ }
+ /*
+ * The above search should succeed. Otherwise return
+ * with an error.
+ */
+ ret = -ENOENT;
+unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+#endif
+
static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -1743,9 +1831,6 @@ static int set_cache_qos_cfg(int level, bool enable)
struct rdt_domain *d;
int cpu;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
@@ -1753,6 +1838,9 @@ static int set_cache_qos_cfg(int level, bool enable)
else
return -EINVAL;
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
/* Pick one CPU from each domain instance to update MSR */
@@ -1972,7 +2060,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
- NULL, "mon_groups",
+ &rdtgroup_default, "mon_groups",
&kn_mongrp);
if (ret < 0)
goto out_info;
@@ -2039,25 +2127,20 @@ enum rdt_param {
nr__rdt_params
};
-static const struct fs_parameter_spec rdt_param_specs[] = {
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
{}
};
-static const struct fs_parameter_description rdt_fs_parameters = {
- .name = "rdt",
- .specs = rdt_param_specs,
-};
-
static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ opt = fs_parse(fc, rdt_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -2207,7 +2290,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
free_rmid(sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
- kfree(sentry);
+
+ if (atomic_read(&sentry->waitcount) != 0)
+ sentry->flags = RDT_DELETED;
+ else
+ kfree(sentry);
}
}
@@ -2245,7 +2332,11 @@ static void rmdir_all_sub(void)
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
- kfree(rdtgrp);
+
+ if (atomic_read(&rdtgrp->waitcount) != 0)
+ rdtgrp->flags = RDT_DELETED;
+ else
+ kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2282,7 +2373,7 @@ static void rdt_kill_sb(struct super_block *sb)
static struct file_system_type rdt_fs_type = {
.name = "resctrl",
.init_fs_context = rdt_init_fs_context,
- .parameters = &rdt_fs_parameters,
+ .parameters = rdt_fs_parameters,
.kill_sb = rdt_kill_sb,
};
@@ -2448,7 +2539,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
/*
* Create the mon_data directory first.
*/
- ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
if (ret)
return ret;
@@ -2638,7 +2729,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
{
@@ -2647,11 +2737,9 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
uint files = 0;
int ret;
- prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
- rdt_last_cmd_clear();
+ prdtgrp = rdtgroup_kn_lock_live(parent_kn);
if (!prdtgrp) {
ret = -ENODEV;
- rdt_last_cmd_puts("Directory was removed\n");
goto out_unlock;
}
@@ -2722,7 +2810,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
kernfs_activate(kn);
/*
- * The caller unlocks the prgrp_kn upon success.
+ * The caller unlocks the parent_kn upon success.
*/
return 0;
@@ -2733,7 +2821,7 @@ out_destroy:
out_free_rgrp:
kfree(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2750,15 +2838,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
* to monitor a subset of tasks and cpus in its parent ctrl_mon group.
*/
static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
- const char *name,
- umode_t mode)
+ const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp, *prgrp;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2771,7 +2856,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
*/
list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2780,7 +2865,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
* to allocate and monitor resources.
*/
static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp;
@@ -2788,8 +2872,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
u32 closid;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2814,7 +2897,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
*/
- ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
@@ -2830,7 +2913,7 @@ out_id_free:
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2863,14 +2946,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* subdirectory
*/
if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
if (rdt_mon_capable && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+ return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
}
@@ -2956,13 +3039,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
+ rdtgroup_ctrl_remove(kn, rdtgrp);
+
/*
* Free all the child monitor group rmids.
*/
free_all_child_rdtgrp(rdtgrp);
- rdtgroup_ctrl_remove(kn, rdtgrp);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index adf9b71386ef..62b137c3c97a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -4,7 +4,7 @@
*/
#include <linux/cpu.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/apic.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index ee48c3fc8a65..d3a0791bc052 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -7,7 +7,7 @@
#include <linux/cpu.h>
#include <asm/apic.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/processor.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..e2ad30e474f8
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 32b4dc9030aa..c222f283b456 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -17,6 +17,12 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+u32 get_umwait_control_msr(void)
+{
+ return umwait_control_cached;
+}
+EXPORT_SYMBOL_GPL(get_umwait_control_msr);
+
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 3c648476d4fb..46d732696c1c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -30,34 +30,69 @@
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
+#include <asm/vmware.h>
#undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt
-#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
+#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
+#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
+
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-#define VMWARE_HYPERVISOR_PORT 0x5658
-#define VMWARE_PORT_CMD_GETVERSION 10
-#define VMWARE_PORT_CMD_GETHZ 45
-#define VMWARE_PORT_CMD_GETVCPU_INFO 68
-#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
-#define VMWARE_PORT_CMD_VCPU_RESERVED 31
+#define VMWARE_CMD_GETVERSION 10
+#define VMWARE_CMD_GETHZ 45
+#define VMWARE_CMD_GETVCPU_INFO 68
+#define VMWARE_CMD_LEGACY_X2APIC 3
+#define VMWARE_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "0"(VMWARE_HYPERVISOR_MAGIC), \
- "1"(VMWARE_PORT_CMD_##cmd), \
- "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
- "memory");
+ __asm__("inl (%%dx), %%eax" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
+ __asm__("vmcall" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(0), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
+ __asm__("vmmcall" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "a"(VMWARE_HYPERVISOR_MAGIC), \
+ "c"(VMWARE_CMD_##cmd), \
+ "d"(0), "b"(UINT_MAX) : \
+ "memory")
+
+#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
+ switch (vmware_hypercall_mode) { \
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
+ VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
+ break; \
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
+ VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
+ break; \
+ default: \
+ VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
+ break; \
+ } \
+ } while (0)
static unsigned long vmware_tsc_khz __ro_after_init;
+static u8 vmware_hypercall_mode __ro_after_init;
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
+ VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
}
@@ -129,6 +164,10 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMCALL);
+ else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
}
static void __init vmware_platform_setup(void)
@@ -136,7 +175,7 @@ static void __init vmware_platform_setup(void)
uint32_t eax, ebx, ecx, edx;
uint64_t lpj, tsc_khz;
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+ VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX) {
lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
@@ -174,10 +213,21 @@ static void __init vmware_platform_setup(void)
vmware_set_capabilities();
}
+static u8 vmware_select_hypercall(void)
+{
+ int eax, ebx, ecx, edx;
+
+ cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
+ return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
+ CPUID_VMWARE_FEATURES_ECX_VMCALL));
+}
+
/*
* While checking the dmi string information, just checking the product
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
+ * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
+ * intentionally defaults to 0.
*/
static uint32_t __init vmware_platform(void)
{
@@ -187,8 +237,16 @@ static uint32_t __init vmware_platform(void)
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
- if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
+ if (eax >= CPUID_VMWARE_FEATURES_LEAF)
+ vmware_hypercall_mode =
+ vmware_select_hypercall();
+
+ pr_info("hypercall mode: 0x%02x\n",
+ (unsigned int) vmware_hypercall_mode);
+
return CPUID_VMWARE_INFO_LEAF;
+ }
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
return 1;
@@ -200,9 +258,9 @@ static uint32_t __init vmware_platform(void)
static bool __init vmware_legacy_x2apic_available(void)
{
uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
- return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
- (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+ VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
+ return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 &&
+ (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
}
const __initconst struct hypervisor_x86 x86_hyper_vmware = {
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 8e6f2f4b4afe..df1358ba622b 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -16,13 +16,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -58,8 +51,6 @@ static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
@@ -89,31 +80,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
}
-static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
@@ -141,8 +107,7 @@ static void init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- zhaoxin_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
OpenPOWER on IntegriCloud