diff options
Diffstat (limited to 'arch/x86/kernel')
70 files changed, 1455 insertions, 1208 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0503f5bfb18d..45257cf84370 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -125,6 +125,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +ifdef CONFIG_FRAME_POINTER +obj-y += unwind_frame.o +else +obj-y += unwind_guess.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 3242e591fa82..26b78d86f25a 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3eee53..32a7d70913ac 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; - cpu = generic_processor_info(id, ver); + cpu = __generic_processor_info(id, ver, enabled); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -282,6 +277,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; + acpi_table_print_madt_entry(header); + acpi_lapic_addr = lapic_addr_ovr->address; return 0; @@ -705,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -716,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) numa_set_node(cpu, nid); } #endif + return 0; } int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) @@ -998,21 +996,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). - */ - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); - return count; - } - - register_lapic_address(acpi_lapic_addr); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_LOCAL_APIC); @@ -1031,8 +1014,8 @@ static int __init acpi_parse_madt_lapic_entries(void) return ret; } - x2count = madt_proc[0].count; - count = madt_proc[1].count; + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); @@ -1513,7 +1496,7 @@ void __init acpi_boot_table_init(void) * If acpi_disabled, bail out */ if (acpi_disabled) - return; + return; /* * Initialize the ACPI boot-time table parser. diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c new file mode 100644 index 000000000000..6fb478bf82fd --- /dev/null +++ b/arch/x86/kernel/acpi/cppc_msr.c @@ -0,0 +1,58 @@ +/* + * cppc_msr.c: MSR Interface for CPPC + * Copyright (c) 2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index adb3eaf8fe2a..48587335ede8 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50c95af0f017..f266b8a92a9e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -1374,7 +1376,6 @@ void setup_local_APIC(void) * Actually disabling the focus CPU check just makes the hang less * frequent as it makes the interrupt distributon model be more * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro */ /* @@ -1816,8 +1817,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1828,17 +1828,14 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); + APIC_BASE, address); } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2027,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of + * nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "Only %d processors supported." + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids - 1, nr_logical_cpuids, apicid); + return -1; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2093,7 +2136,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2103,8 +2145,16 @@ int generic_processor_info(int apicid, int version) * for BSP. */ cpu = 0; - } else - cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } /* * This can happen on physical hotplug. The sanity check at boot time @@ -2116,6 +2166,7 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } @@ -2128,14 +2179,12 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } - physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -2148,11 +2197,23 @@ int generic_processor_info(int apicid, int version) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - set_cpu_present(cpu, true); + + if (enabled) { + num_processors++; + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + } else { + disabled_cpus++; + } return cpu; } +int generic_processor_info(int apicid, int version) +{ + return __generic_processor_info(apicid, version, true); +} + int hard_smp_processor_id(void) { return read_apic_id(); @@ -2275,7 +2336,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. */ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5b2ae106bd4a..a4d7ff20ed22 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -25,7 +25,7 @@ static struct apic apic_physflat; static struct apic apic_flat; -struct apic __read_mostly *apic = &apic_flat; +struct apic *apic __ro_after_init = &apic_flat; EXPORT_SYMBOL_GPL(apic); static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector) static unsigned int flat_get_apic_id(unsigned long x) { - unsigned int id; - - id = (((x)>>24) & 0xFFu); - - return id; + return (x >> 24) & 0xFF; } static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xFFu)<<24); - return x; + return (id & 0xFF) << 24; } static unsigned int read_xapic_id(void) { - unsigned int id; - - id = flat_get_apic_id(apic_read(APIC_ID)); - return id; + return flat_get_apic_id(apic_read(APIC_ID)); } static int flat_apic_id_registered(void) @@ -154,7 +144,7 @@ static int flat_probe(void) return 1; } -static struct apic apic_flat = { +static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -248,7 +238,7 @@ static int physflat_probe(void) return 0; } -static struct apic apic_physflat = { +static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c05688b2deff..b109e4389c92 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } -struct apic apic_noop = { +struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, .acpi_madt_oem_check = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 714d4fda0d52..e08fe2c8dd8c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) static unsigned long numachip1_set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xffU) << 24); - return x; + return (id & 0xff) << 24; } static unsigned int numachip2_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 06dbaa458bfe..56012010332c 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,7 +142,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -static struct apic apic_bigsmp = { +static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f417a8e4..48e6d84f173e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ade25320df96..015bbf30e3e3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } -static struct irq_chip hpet_msi_controller = { +static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e716c158..c48264e202fd 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -72,7 +72,7 @@ static int probe_default(void) return 1; } -static struct apic apic_default = { +static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, @@ -126,7 +126,7 @@ static struct apic apic_default = { apic_driver(apic_default); -struct apic *apic = &apic_default; +struct apic *apic __ro_after_init = &apic_default; EXPORT_SYMBOL_GPL(apic); static int cmdline_apic __initdata; @@ -152,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 54f35d988025..200af5ae9662 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -227,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } -static struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 4f13f54f1b1f..ff111f05a314 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -98,7 +98,7 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -static struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index cb0673c1e940..aeef53ce93e1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -533,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x) static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - /* maskout x2apic_extra_bits ? */ - x = id; - return x; + /* CHECKME: Do we need to mask out the xapic extra bits? */ + return id; } static unsigned int uv_read_apic_id(void) @@ -560,7 +557,7 @@ static int uv_probe(void) return apic == &apic_x2apic_uv_x; } -static struct apic __refdata apic_x2apic_uv_x = { +static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, @@ -927,7 +924,7 @@ static void uv_heartbeat(unsigned long ignored) mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } -static void uv_heartbeat_enable(int cpu) +static int uv_heartbeat_enable(unsigned int cpu) { while (!uv_cpu_scir_info(cpu)->enabled) { struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; @@ -941,43 +938,24 @@ static void uv_heartbeat_enable(int cpu) /* also ensure that boot cpu is enabled */ cpu = 0; } + return 0; } #ifdef CONFIG_HOTPLUG_CPU -static void uv_heartbeat_disable(int cpu) +static int uv_heartbeat_disable(unsigned int cpu) { if (uv_cpu_scir_info(cpu)->enabled) { uv_cpu_scir_info(cpu)->enabled = 0; del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); -} - -/* - * cpu hotplug notifier - */ -static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - uv_heartbeat_enable(cpu); - break; - case CPU_DOWN_PREPARE: - uv_heartbeat_disable(cpu); - break; - default: - break; - } - return NOTIFY_OK; + return 0; } static __init void uv_scir_register_cpu_notifier(void) { - hotcpu_notifier(uv_scir_cpu_notify, 0); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", + uv_heartbeat_enable, uv_heartbeat_disable); } #else /* !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2bd5c6ff7ee7..c62e015b126c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -29,10 +29,13 @@ void common(void) { BLANK(); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_CC_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif BLANK(); + OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ecdc1d217dc0..880aa093268d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -57,6 +57,11 @@ void foo(void) /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d875f97d4e0b..210927ee2e74 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -56,6 +56,11 @@ int main(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + BLANK(); +#endif + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda03c527..9bd910a7dd0a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) - return; - - cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - c->cpu_index = 0; - filter_cpuid_features(c, false); + c->cpu_index = 0; + filter_cpuid_features(c, false); - if (this_cpu->c_bsp_init) - this_cpu->c_bsp_init(c); + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); @@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) debug_idt_table }; +struct desc_ptr idt_descr __ro_after_init = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) idt_table, +}; +const struct desc_ptr debug_idt_descr = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; @@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 27e46658ebe3..35691a6b0d32 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void) x86_hyper->x2apic_available && x86_hyper->x2apic_available(); } + +void hypervisor_pin_vcpu(int cpu) +{ + if (!x86_hyper) + return; + + if (x86_hyper->pin_vcpu) + x86_hyper->pin_vcpu(cpu); + else + WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d8ec849468..a7fdf453d895 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> +#include <linux/jump_label.h> #include <asm/processor.h> #include <asm/traps.h> @@ -292,6 +293,13 @@ static void print_mce(struct mce *m) if (m->misc) pr_cont("MISC %llx ", m->misc); + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + pr_cont("\n"); /* * Note this output is parsed by external tools and old fields @@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); + if (m->status & MCI_STATUS_ADDRV) { m->addr = mce_rdmsrl(msr_ops.addr(i)); @@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i) m->addr >>= shift; m->addr <<= shift; } + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); } } @@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; - /* - * MCG_CAP.MCG_SER_P is necessary but not sufficient to know - * whether this processor will actually generate recoverable - * machine checks. Check to see if this is an E7 model Xeon. - * We can't do a model number check because E5 and E7 use the - * same model number. E5 doesn't support recovery, E7 does. - */ - if (mca_cfg.recovery || (mca_cfg.ser && - !strncmp(c->x86_model_id, - "Intel(R) Xeon(R) CPU E7-", 24))) - set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank) * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() */ static int __init mcheck_enable(char *str) { @@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void) static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + static int __init mcheck_late_init(void) { + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + mcheck_debugfs_init(); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 7b7f3be783d4..9b5403462936 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/string.h> #include <asm/amd_nb.h> #include <asm/apic.h> @@ -63,34 +64,71 @@ static const char * const th_names[] = { "execution_unit", }; -/* Define HWID to IP type mappings for Scalable MCA */ -struct amd_hwid amd_hwids[] = { - [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, - [SMCA_DF] = { "data_fabric", 0x2E }, - [SMCA_UMC] = { "umc", 0x96 }, - [SMCA_PB] = { "param_block", 0x5 }, - [SMCA_PSP] = { "psp", 0xFF }, - [SMCA_SMU] = { "smu", 0x1 }, +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" }; -EXPORT_SYMBOL_GPL(amd_hwids); - -const char * const amd_core_mcablock_names[] = { - [SMCA_LS] = "load_store", - [SMCA_IF] = "insn_fetch", - [SMCA_L2_CACHE] = "l2_cache", - [SMCA_DE] = "decode_unit", - [RES] = "", - [SMCA_EX] = "execution_unit", - [SMCA_FP] = "floating_point", - [SMCA_L3_CACHE] = "l3_cache", + +struct smca_bank_name smca_bank_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(amd_core_mcablock_names); +EXPORT_SYMBOL_GPL(smca_bank_names); + +static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, -const char * const amd_df_mcablock_names[] = { - [SMCA_CS] = "coherent_slave", - [SMCA_PIE] = "pie", + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + +struct smca_bank_info smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ @@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; * CPU Initialization */ +static void get_smca_bank_info(unsigned int bank) +{ + unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + struct smca_hwid_mcatype *type; + u32 high, instanceId; + u16 hwid, mcatype; + + /* Collect bank_info using CPU 0 for now. */ + if (cpu) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid = high & MCI_IPID_HWID; + mcatype = (high & MCI_IPID_MCATYPE) >> 16; + hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + type = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == type->hwid_mcatype) { + smca_banks[bank].type = type; + smca_banks[bank].type_instance = instanceId; + break; + } + } +} + struct thresh_restart { struct threshold_block *b; int reset; @@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 get_block_address(u32 current_addr, u32 low, u32 high, +static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, */ u32 low, high; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) return addr; if (!(low & MCI_CONFIG_MCAX)) return addr; - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } @@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high &= ~BIT(2); + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) + smca_high |= BIT(5); + wrmsr(smca_addr, smca_low, smca_high); } @@ -421,12 +503,15 @@ out: void mce_amd_feature_init(struct cpuinfo_x86 *c) { u32 low = 0, high = 0, address = 0; - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + get_smca_bank_info(bank); + for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (threshold_err) m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) + if (m.status & MCI_STATUS_ADDRV) { rdmsrl(msr_addr, m.addr); + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + mce_log(&m); wrmsrl(msr_status, 0); @@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; - int cpu = smp_processor_id(); - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + unsigned int bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + if (!smca_banks[bank].type) + return NULL; + + bank_type = smca_banks[bank].type->bank_type; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_bank_names[bank_type].name, + smca_banks[bank].type_instance); + return buf_mcatype; +} + static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, unsigned int block, u32 address) { @@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - (bank == 4 ? bank4_names(b) : th_names[bank])); + get_name(bank, b)); if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(cpu, address, low, high, bank, ++block); if (!address) return 0; @@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = th_names[bank]; + const char *name = get_name(bank, NULL); int err = 0; if (is_shared_bank(bank)) { @@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index b816971f5da4..620ab06bcf45 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,6 +54,7 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -281,18 +282,22 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - if (!load_builtin_amd_microcode(&cp, family)) + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) cp = find_ucode_in_initrd(); if (!(cp.data && cp.size)) @@ -373,7 +378,8 @@ void load_ucode_amd_ap(void) return; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); @@ -439,7 +445,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index df04b2d033f6..5ce5155f0695 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -558,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +static int mc_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct device *dev; dev = get_cpu_device(cpu); + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - /* - * "break" is missing on purpose here because we want to fall - * through in order to create the sysfs group. - */ - - case CPU_DOWN_FAILED: - if (sysfs_create_group(&dev->kobj, &mc_attr_group)) - pr_err("Failed to create group for CPU%d\n", cpu); - break; + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + return 0; +} - case CPU_DOWN_PREPARE: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); - break; +static int mc_cpu_down_prep(unsigned int cpu) +{ + struct device *dev; + dev = get_cpu_device(cpu); + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); /* - * case CPU_DEAD: - * * When a CPU goes offline, don't free up or invalidate the copy of * the microcode in kernel memory, so that we can reuse it when the * CPU comes back online without unnecessarily requesting the userspace * for it again. */ - } - - /* The CPU refused to come up during a system resume */ - if (action == CPU_UP_CANCELED_FROZEN) - microcode_fini_cpu(cpu); - - return NOTIFY_OK; + return 0; } -static struct notifier_block mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL @@ -665,7 +646,8 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); - register_hotcpu_notifier(&mc_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); pr_info("Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 28f1b54b7fad..24e87e74990d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(const struct mtrr_ops *ops) +void __init set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6c7ced07d16d..ad8bd763efa5 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index, bool get_mtrr_state(void); void mtrr_bp_pat_init(void); -extern void set_mtrr_ops(const struct mtrr_ops *ops); +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 92e8f0a7159c..9b7cf5c28f5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,7 +17,7 @@ #include <linux/sysfs.h> #include <asm/stacktrace.h> - +#include <asm/unwind.h> int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -25,11 +25,29 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, - void *data) + char *log_lvl) { + touch_nmi_watchdog(); printk("%s [<%p>] %s%pB\n", - (char *)data, (void *)address, reliable ? "" : "? ", + log_lvl, (void *)address, reliable ? "" : "? ", (void *)address); } @@ -38,176 +56,120 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) { - unsigned long ret_addr; - int index; - - if (addr != (unsigned long)return_to_handler) - return; - - index = task->curr_ret_stack; - - if (!task->ret_stack || index < *graph) - return; - - index -= *graph; - ret_addr = task->ret_stack[index].ret; - - ops->address(data, ret_addr, 1); + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; - (*graph)++; -} -#else -static inline void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ } -#endif - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct task_struct *task, - void *p, unsigned int size, void *end) -{ - void *t = task_stack_page(task); - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p >= t && p < t + THREAD_SIZE - size; -} + printk("%sCall Trace:\n", log_lvl); -unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; + unwind_start(&state, task, regs, stack); /* - * If we overflowed the stack into a guard page, jump back to the - * bottom of the usable stack. + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * + * x86-32 can have up to three stacks: + * - task stack + * - softirq stack + * - hardirq stack */ - if ((unsigned long)task_stack_page(task) - (unsigned long)stack < - PAGE_SIZE) - stack = (unsigned long *)task_stack_page(task); - - while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, 0); - } - print_ftrace_graph_addr(addr, data, ops, task, graph); - } - stack++; - } - return bp; -} -EXPORT_SYMBOL_GPL(print_context_stack); - -unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *ret_addr = &frame->return_address; + for (; stack; stack = stack_info.next_sp) { + const char *str_begin, *str_end; - while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { - unsigned long addr = *ret_addr; + /* + * If we overflowed the task stack into a guard page, jump back + * to the bottom of the usable stack. + */ + if (task_stack_page(task) - (void *)stack < PAGE_SIZE) + stack = task_stack_page(task); - if (!__kernel_text_address(addr)) + if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - if (ops->address(data, addr, 1)) - break; - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, task, graph); - } - - return (unsigned long)frame; -} -EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static int print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_stack_address(addr, reliable, data); - return 0; -} - -static const struct stacktrace_ops print_trace_ops = { - .stack = print_trace_stack, - .address = print_trace_address, - .walk_stack = print_context_stack, -}; - -void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} + stack_type_str(stack_info.type, &str_begin, &str_end); + if (str_begin) + printk("%s <%s> ", log_lvl, str_begin); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = *stack; + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); + if (str_end) + printk("%s <%s> ", log_lvl, str_end); + } } void show_stack(struct task_struct *task, unsigned long *sp) { - unsigned long bp = 0; - unsigned long stack; + task = task ? : current; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. */ - if (!sp && (!task || task == current)) { - sp = &stack; - bp = stack_frame(current, NULL); - } + if (!sp && task == current) + sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, bp, ""); + show_stack_log_lvl(task, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); + show_stack_log_lvl(current, regs, NULL, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 09675712eba8..06eb322b5f9f 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,93 +16,121 @@ #include <asm/stacktrace.h> -static void *is_irq_stack(void *p, void *irq) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - if (p < irq || p >= (irq + THREAD_SIZE)) - return NULL; - return irq + THREAD_SIZE; + switch (type) { + case STACK_TYPE_IRQ: + case STACK_TYPE_SOFTIRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + default: + *begin = NULL; + *end = NULL; + } } - -static void *is_hardirq_stack(unsigned long *stack, int cpu) +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - void *irq = per_cpu(hardirq_stack, cpu); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - return is_irq_stack(stack, irq); -} + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; -static void *is_softirq_stack(unsigned long *stack, int cpu) -{ - void *irq = per_cpu(softirq_stack, cpu); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - return is_irq_stack(stack, irq); + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; } -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - int graph = 0; - u32 *prev_esp; + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (!task) - task = current; + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; - if (!stack) { - unsigned long dummy; + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; - if (!bp) - bp = stack_frame(task, regs); + return true; +} - for (;;) { - void *end_stack; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - end_stack = is_hardirq_stack(stack, cpu); - if (!end_stack) - end_stack = is_softirq_stack(stack, cpu); + task = task ? : current; - bp = ops->walk_stack(task, stack, bp, ops, data, - end_stack, &graph); + if (in_task_stack(stack, task, info)) + goto recursion_check; - /* Stop if not on irq stack */ - if (!end_stack) - break; + if (task != current) + goto unknown; - /* The previous esp is saved on the bottom of the stack */ - prev_esp = (u32 *)(end_stack - THREAD_SIZE); - stack = (unsigned long *)*prev_esp; - if (!stack) - break; + if (in_hardirq_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - touch_nmi_watchdog(); + if (in_softirq_stack(stack, info)) + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; } - put_cpu(); + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + if (!try_get_task_stack(task)) + return; + + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { @@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } @@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 9ee4520ce83c..36cf1a498227 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,261 +16,145 @@ #include <asm/stacktrace.h> +static char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ NMI_STACK-1 ] = "NMI", + [ DEBUG_STACK-1 ] = "#DB", + [ MCE_STACK-1 ] = "#MC", +}; -#define N_EXCEPTION_STACKS_END \ - (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) - -static char x86_stack_ids[][8] = { - [ DEBUG_STACK-1 ] = "#DB", - [ NMI_STACK-1 ] = "NMI", - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ MCE_STACK-1 ] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [ N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS_END ] = "#DB[?]" -#endif +static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = x86_stack_ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - x86_stack_ids[j][4] = '1' + - (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = x86_stack_ids[j]; - return (unsigned long *)end; - } -#endif + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + + switch (type) { + case STACK_TYPE_IRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: + *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; + *end = "EOE"; + break; + default: + *begin = NULL; + *end = NULL; } - return NULL; } -static inline int -in_irq_stack(unsigned long *stack, unsigned long *irq_stack, - unsigned long *irq_stack_end) +static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - return (stack >= irq_stack && stack < irq_stack_end); -} - -static const unsigned long irq_stack_size = - (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); - -enum stack_type { - STACK_IS_UNKNOWN, - STACK_IS_NORMAL, - STACK_IS_EXCEPTION, - STACK_IS_IRQ, -}; - -static enum stack_type -analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, - unsigned long **stack_end, unsigned long *irq_stack, - unsigned *used, char **id) -{ - unsigned long addr; + unsigned long *begin, *end; + struct pt_regs *regs; + unsigned k; - addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); - if ((unsigned long)task_stack_page(task) == addr) - return STACK_IS_NORMAL; + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - *stack_end = in_exception_stack(cpu, (unsigned long)stack, - used, id); - if (*stack_end) - return STACK_IS_EXCEPTION; + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - (exception_stack_sizes[k] / sizeof(long)); + regs = (struct pt_regs *)end - 1; - if (!irq_stack) - return STACK_IS_NORMAL; + if (stack < begin || stack >= end) + continue; - *stack_end = irq_stack; - irq_stack = irq_stack - irq_stack_size; + info->type = STACK_TYPE_EXCEPTION + k; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)regs->sp; - if (in_irq_stack(stack, irq_stack, *stack_end)) - return STACK_IS_IRQ; + return true; + } - return STACK_IS_UNKNOWN; + return false; } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned long dummy; - unsigned used = 0; - int graph = 0; - int done = 0; - - if (!task) - task = current; - - if (!stack) { - if (regs) - stack = (unsigned long *)regs->sp; - else if (task != current) - stack = (unsigned long *)task->thread.sp; - else - stack = &dummy; - } + unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); - if (!bp) - bp = stack_frame(task, regs); /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. */ - while (!done) { - unsigned long *stack_end; - enum stack_type stype; - char *id; + if (stack < begin || stack > end) + return false; - stype = analyze_stack(cpu, task, stack, &stack_end, - irq_stack, &used, &id); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - /* Default finish unless specified to continue */ - done = 1; + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); - switch (stype) { + return true; +} - /* Break out early if we are on the thread stack */ - case STACK_IS_NORMAL: - break; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - case STACK_IS_EXCEPTION: + task = task ? : current; - if (ops->stack(data, id) < 0) - break; + if (in_task_stack(stack, task, info)) + goto recursion_check; - bp = ops->walk_stack(task, stack, bp, ops, - data, stack_end, &graph); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) stack_end[-2]; - done = 0; - break; + if (task != current) + goto unknown; - case STACK_IS_IRQ: + if (in_exception_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(task, stack, bp, - ops, data, stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (stack_end[-1]); - irq_stack = NULL; - ops->stack(data, "EOI"); - done = 0; - break; + if (in_irq_stack(stack, info)) + goto recursion_check; - case STACK_IS_UNKNOWN: - ops->stack(data, "UNK"); - break; - } - } + goto unknown; +recursion_check: /* - * This handles the process stack: + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. */ - bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); - put_cpu(); + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; - int cpu; int i; - preempt_disable(); - cpu = smp_processor_id(); + if (!try_get_task_stack(task)) + return; - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); + irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - /* - * Debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu: - */ - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { @@ -299,18 +183,17 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack++; touch_nmi_watchdog(); } - preempt_enable(); pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } void show_regs(struct pt_regs *regs) { int i; - unsigned long sp; - sp = regs->sp; show_regs_print_info(KERN_DEFAULT); __show_regs(regs, 1); @@ -325,8 +208,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501f8935..b85fe5f91c3f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. */ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -348,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type || current_type == E820_PRAM) { + if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) while (nr_map) { u64 start = biosmap->addr; u64 size = biosmap->size; - u64 end = start + size; + u64 end = start + size - 1; u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) + if (start > end && likely(size)) return -1; e820_add_region(start, size, type); @@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820_saved, start, size, old_type, + return __e820_update_range(e820_saved, start, size, old_type, new_type); } @@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, e820_print_type(old_type); printk(KERN_CONT "\n"); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; u64 final_start, final_end; u64 ei_end; @@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map)) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) return; printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) { - sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), - &e820_saved.nr_map); + sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), + &e820_saved->nr_map); } #define MAX_GAP_END 0x100000000ull /* @@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr) { unsigned long long last; - int i = e820.nr_map; + int i = e820->nr_map; int found = 0; last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; + unsigned long long start = e820->map[i].addr; + unsigned long long end = start + e820->map[i].size; if (end < start_addr) continue; @@ -649,6 +651,33 @@ __init void e820_setup_gap(void) gapstart, gapstart + gapsize - 1); } +/* + * Called late during init, in free_initmem(). + * + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them: + * all functions which modify them are __init functions, + * they won't exist after this point. + */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_saved, size); + e820_saved = n; +} + /** * Because of the size limitation of struct boot_params, only first * 128 E820 memory entries are passed to kernel via @@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); @@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) int i; unsigned long pfn = 0; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (pfn < PFN_UP(ei->addr)) register_nosave_region(pfn, PFN_UP(ei->addr)); @@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (ei->type == E820_NVS) acpi_nvs_register(ei->addr, ei->size); @@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; unsigned long start_pfn; unsigned long end_pfn; - /* - * Persistent memory is accounted as ram for purposes of - * establishing max_pfn and mem_map. - */ - if (ei->type != E820_RAM && ei->type != E820_PRAM) + if (ei->type != type) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN); + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32-PAGE_SHIFT)); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p) */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + e820->nr_map = 0; userdef = 1; return 0; } @@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), - &e820.nr_map) < 0) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), + &e820->nr_map) < 0) early_panic("Invalid user supplied memory map"); printk(KERN_INFO "e820: user-defined physical RAM map:\n"); @@ -912,7 +937,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); e820_res = res; - for (i = 0; i < e820.nr_map; i++) { - end = e820.map[i].addr + e820.map[i].size - 1; + for (i = 0; i < e820->nr_map; i++) { + end = e820->map[i].addr + e820->map[i].size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820.map[i].type); - res->start = e820.map[i].addr; + res->name = e820_type_to_string(e820->map[i].type); + res->start = e820->map[i].addr; res->end = end; - res->flags = e820_type_to_iomem_type(e820.map[i].type); - res->desc = e820_type_to_iores_desc(e820.map[i].type); + res->flags = e820_type_to_iomem_type(e820->map[i].type); + res->desc = e820_type_to_iores_desc(e820->map[i].type); /* * don't register the region that could be conflicted with * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (do_mark_busy(e820.map[i].type, res)) { + if (do_mark_busy(e820->map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; + for (i = 0; i < e820_saved->nr_map; i++) { + struct e820entry *entry = &e820_saved->map[i]; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry->type)); @@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? */ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void) struct resource *res; res = e820_res; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; @@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void) * Try to bump up RAM regions to reasonable boundaries to * avoid stolen RAM: */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *entry = &e820->map[i]; u64 start, end; if (entry->type != E820_RAM) @@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820.nr_map = 0; + e820->nr_map = 0; e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } @@ -1124,7 +1149,7 @@ void __init setup_memory_map(void) char *who; who = x86_init.resources.memory_setup(); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } @@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void) */ memblock_allow_resize(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; end = ei->addr + ei->size; if (end != (resource_size_t)end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index de7501edb21c..18bb3a639197 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -555,7 +555,7 @@ intel_graphics_stolen(int num, int slot, int func, /* Mark this space as reserved */ e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 93982aebb398..2f2b8c7ccb85 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb4495d..8639bb2ae058 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { + frame_pointer, parent) == -EBUSY) { *parent = old; return; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6f8902b0d151..5f401262f12d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * start_secondary(). */ ENTRY(start_cpu0) - movl stack_start, %ecx + movl initial_stack, %ecx movl %ecx, %esp jmp *(initial_code) ENDPROC(start_cpu0) @@ -307,7 +307,7 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp @@ -703,7 +703,7 @@ ENTRY(initial_page_table) .data .balign 4 -ENTRY(stack_start) +ENTRY(initial_stack) .long init_thread_union+THREAD_SIZE __INITRODATA diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9f8efc9f0075..c98a559c346e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,7 +66,7 @@ startup_64: */ /* - * Setup stack for verify_cpu(). "-8" because stack_start is defined + * Setup stack for verify_cpu(). "-8" because initial_stack is defined * this way, see below. Our best guess is a NULL ptr for stack * termination heuristics and we don't want to break anything which * might depend on it (kgdb, ...). @@ -226,7 +226,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip), %rsp + movq initial_stack(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64) * start_secondary(). */ ENTRY(start_cpu0) - movq stack_start(%rip),%rsp + movq initial_stack(%rip),%rsp movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder pushq $__KERNEL_CS # set correct cs @@ -319,17 +319,15 @@ ENTRY(start_cpu0) ENDPROC(start_cpu0) #endif - /* SMP bootup changes these two */ + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - - GLOBAL(stack_start) + GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 - .word 0 __FINITDATA bad_address: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6dfd801df97..274fab99169d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) /* * Clock source related code */ +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +/* + * Reading the HPET counter is a very slow operation. If a large number of + * CPUs are trying to access the HPET counter simultaneously, it can cause + * massive delay and slow down system performance dramatically. This may + * happen when HPET is the default clock source instead of TSC. For a + * really large system with hundreds of CPUs, the slowdown may be so + * severe that it may actually crash the system because of a NMI watchdog + * soft lockup, for example. + * + * If multiple CPUs are trying to access the HPET counter at the same time, + * we don't actually need to read the counter multiple times. Instead, the + * other CPUs can use the counter value read by the first CPU in the group. + * + * This special feature is only enabled on x86-64 systems. It is unlikely + * that 32-bit x86 systems will have enough CPUs to require this feature + * with its associated locking overhead. And we also need 64-bit atomic + * read. + * + * The lock and the hpet value are stored together and can be read in a + * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t + * is 32 bits in size. + */ +union hpet_lock { + struct { + arch_spinlock_t lock; + u32 value; + }; + u64 lockval; +}; + +static union hpet_lock hpet __cacheline_aligned = { + { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, +}; + +static cycle_t read_hpet(struct clocksource *cs) +{ + unsigned long flags; + union hpet_lock old, new; + + BUILD_BUG_ON(sizeof(union hpet_lock) != 8); + + /* + * Read HPET directly if in NMI. + */ + if (in_nmi()) + return (cycle_t)hpet_readl(HPET_COUNTER); + + /* + * Read the current state of the lock and HPET value atomically. + */ + old.lockval = READ_ONCE(hpet.lockval); + + if (arch_spin_is_locked(&old.lock)) + goto contended; + + local_irq_save(flags); + if (arch_spin_trylock(&hpet.lock)) { + new.value = hpet_readl(HPET_COUNTER); + /* + * Use WRITE_ONCE() to prevent store tearing. + */ + WRITE_ONCE(hpet.value, new.value); + arch_spin_unlock(&hpet.lock); + local_irq_restore(flags); + return (cycle_t)new.value; + } + local_irq_restore(flags); + +contended: + /* + * Contended case + * -------------- + * Wait until the HPET value change or the lock is free to indicate + * its value is up-to-date. + * + * It is possible that old.value has already contained the latest + * HPET value while the lock holder was in the process of releasing + * the lock. Checking for lock state change will enable us to return + * the value immediately instead of waiting for the next HPET reader + * to come along. + */ + do { + cpu_relax(); + new.lockval = READ_ONCE(hpet.lockval); + } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); + + return (cycle_t)new.value; +} +#else +/* + * For UP or 32-bit. + */ static cycle_t read_hpet(struct clocksource *cs) { return (cycle_t)hpet_readl(HPET_COUNTER); } +#endif static struct clocksource clocksource_hpet = { .name = "hpet", diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a7903714065..9ebd0b0e73d9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f2356bda2b05..3407b148c240 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved.nr_map; + nr_e820_entries = e820_saved->nr_map; /* TODO: Pass entries more than E820MAX in bootparams setup data */ if (nr_e820_entries > E820MAX) nr_e820_entries = E820MAX; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved.map, + memcpy(¶ms->e820_map, &e820_saved->map, nr_e820_entries * sizeof(struct e820entry)); return 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 04cde527d728..8e36f249646e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -50,6 +50,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/switch_to.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_DX] = 0; gdb_regs[GDB_SI] = 0; gdb_regs[GDB_DI] = 0; - gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; #ifdef CONFIG_X86_32 gdb_regs[GDB_DS] = __KERNEL_DS; gdb_regs[GDB_ES] = __KERNEL_DS; gdb_regs[GDB_PS] = 0; gdb_regs[GDB_CS] = __KERNEL_CS; - gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_PS] = 0; gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_R14] = 0; gdb_regs[GDB_R15] = 0; #endif + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_SP] = p->thread.sp; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7847e5c0e0b5..28cee019209c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,7 +45,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4425f593f0ec..3bb4c5f021f6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index c2bedaea11f7..4afc67f5facc 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -184,7 +184,7 @@ out: static struct kobj_attribute type_attr = __ATTR_RO(type); -static struct bin_attribute data_attr = { +static struct bin_attribute data_attr __ro_after_init = { .attr = { .name = "data", .mode = S_IRUGO, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1726c4c12336..edbbfc854e39 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -423,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_online(void *dummy) -{ - kvm_guest_cpu_init(); -} - -static void kvm_guest_cpu_offline(void *dummy) +static void kvm_guest_cpu_offline(void) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -437,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy) apf_task_wake_all(); } -static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int kvm_cpu_online(unsigned int cpu) { - int cpu = (unsigned long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); - break; - default: - break; - } - return NOTIFY_OK; + local_irq_disable(); + kvm_guest_cpu_init(); + local_irq_enable(); + return 0; } -static struct notifier_block kvm_cpu_notifier = { - .notifier_call = kvm_cpu_notify, -}; +static int kvm_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + kvm_guest_cpu_offline(); + local_irq_enable(); + return 0; +} #endif static void __init kvm_apf_trap_init(void) @@ -494,7 +481,9 @@ void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; - register_cpu_notifier(&kvm_cpu_notifier); + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", + kvm_cpu_online, kvm_cpu_down_prepare) < 0) + pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else kvm_guest_cpu_init(); #endif @@ -575,9 +564,6 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } - -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void kvm_wait(u8 *ptr, u8 val) @@ -606,243 +592,6 @@ out: local_irq_restore(flags); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -enum kvm_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - -#ifdef CONFIG_KVM_DEBUG_FS -#define HISTO_BUCKETS 30 - -static struct kvm_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old; - - old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ - check_zero(); - spinlock_stats.contention_stats[var] += val; -} - - -static inline u64 spin_time_start(void) -{ - return sched_clock(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index; - - index = ilog2(delta); - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta; - - delta = sched_clock() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} - -static struct dentry *d_spin_debug; -static struct dentry *d_kvm_debug; - -static struct dentry *kvm_init_debugfs(void) -{ - d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); - if (!d_kvm_debug) - printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); - - return d_kvm_debug; -} - -static int __init kvm_spinlock_debugfs(void) -{ - struct dentry *d_kvm; - - d_kvm = kvm_init_debugfs(); - if (d_kvm == NULL) - return -ENOMEM; - - d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(kvm_spinlock_debugfs); -#else /* !CONFIG_KVM_DEBUG_FS */ -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_KVM_DEBUG_FS */ - -struct kvm_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -/* cpus 'waiting' on a spinlock to become available */ -static cpumask_t waiting_cpus; - -/* Track spinlock on which a cpu is waiting */ -static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); - -__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - struct kvm_lock_waiting *w; - int cpu; - u64 start; - unsigned long flags; - __ticket_t head; - - if (in_nmi()) - return; - - w = this_cpu_ptr(&klock_waiting); - cpu = smp_processor_id(); - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - - /* - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". - */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - add_stats(TAKEN_SLOW, 1); - - /* - * This uses set_bit, which is atomic but we should not rely on its - * reordering gurantees. So barrier is needed after this call. - */ - cpumask_set_cpu(cpu, &waiting_cpus); - - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. - */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking. - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* - * halt until it's our turn and kicked. Note that we do safe halt - * for irq enabled case to avoid hang when lock info is overwritten - * in irq spinlock slowpath and no spurious interrupt occur to save us. - */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - local_irq_restore(flags); - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); - -/* Kick vcpu waiting on @lock->head to reach value @ticket */ -static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - for_each_cpu(cpu, &waiting_cpus) { - const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == ticket) { - add_stats(RELEASED_SLOW_KICKED, 1); - kvm_kick_cpu(cpu); - break; - } - } -} - -#endif /* !CONFIG_QUEUED_SPINLOCKS */ - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -854,16 +603,11 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; -#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -#else /* !CONFIG_QUEUED_SPINLOCKS */ - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); - pv_lock_ops.unlock_kick = kvm_unlock_kick; -#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..60b9949f1e65 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,7 +29,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -static int kvmclock = 1; +static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static cycle_t kvm_sched_clock_offset; @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 068c4a929de6..0f8d20497383 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; + if (!smp_found_config) + return; + if (!mpf) return; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 1939a0269377..2c55a003b793 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,6 @@ #include <asm/paravirt.h> -#ifdef CONFIG_QUEUED_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void) return pv_lock_ops.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } -#endif struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP -#ifdef CONFIG_QUEUED_SPINLOCKS .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, -#else /* !CONFIG_QUEUED_SPINLOCKS */ - .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), - .unlock_kick = paravirt_nop, -#endif /* !CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1acfd76e3e26..bbf3d5933eaa 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -332,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, @@ -389,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops = { +struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 158dc0650d5d..920c6ae08592 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); #endif @@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index e70087a04cc8..bb3840cedb4f 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); #endif @@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 62c0b0ea2ce4..4002b475171c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -513,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } /* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + struct inactive_task_frame *frame = + (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); + return READ_ONCE_NOCHECK(frame->ret_addr); +} + +/* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack @@ -520,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip; + unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + if (!try_get_task_stack(p)) + return 0; + start = (unsigned long)task_stack_page(p); if (!start) - return 0; + goto out; /* * Layout of the stack page: @@ -537,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -550,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) - return 0; + goto out; - fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) - return 0; + goto out; ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; + +out: + put_task_stack(p); + return ret; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d86be29c38c7..bd7be8efdc4c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,17 +55,6 @@ #include <asm/switch_to.h> #include <asm/vm86.h> -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.sp)[3]; -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = __read_cr4_safe(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); @@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); + struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); + struct inactive_task_frame *frame = &fork_frame->frame; struct task_struct *tsk; int err; - p->thread.sp = (unsigned long) childregs; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.ip = (unsigned long) ret_from_kernel_thread; - task_user_gs(p) = __KERNEL_STACK_CANARY; - childregs->ds = __USER_DS; - childregs->es = __USER_DS; - childregs->fs = __KERNEL_PERCPU; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->di = arg; p->thread.io_bitmap_ptr = NULL; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; - p->thread.ip = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8f84bf..ee944bd2310d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,8 +49,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> - -asmlinkage extern void ret_from_fork(void); +#include <asm/vdso.h> __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -141,12 +140,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, { int err; struct pt_regs *childregs; + struct fork_frame *fork_frame; + struct inactive_task_frame *frame; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); - p->thread.sp = (unsigned long) childregs; - set_tsk_thread_flag(p, TIF_FORK); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); @@ -160,15 +164,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->sp = (unsigned long)childregs; - childregs->ss = __KERNEL_DS; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->r12 = arg; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; @@ -511,7 +511,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current_thread_info()->status &= ~TS_COMPAT; + current->thread.status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -519,11 +519,24 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); +#ifdef CONFIG_CHECKPOINT_RESTORE +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) +{ + int ret; + + ret = map_vdso_once(image, addr); + if (ret) + return ret; + + return (long)image->size; +} +#endif + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; @@ -577,6 +590,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) break; } +#ifdef CONFIG_CHECKPOINT_RESTORE +# ifdef CONFIG_X86_X32_ABI + case ARCH_MAP_VDSO_X32: + return prctl_map_vdso(&vdso_image_x32, addr); +# endif +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case ARCH_MAP_VDSO_32: + return prctl_map_vdso(&vdso_image_32, addr); +# endif + case ARCH_MAP_VDSO_64: + return prctl_map_vdso(&vdso_image_64, addr); +#endif + default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a541ff..0e63c0267f99 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_I386_REGS_POKED; + child->thread.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): @@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static struct user_regset x86_64_regsets[] __read_mostly = { +static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static struct user_regset x86_32_regsets[] __read_mostly = { +static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = { */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); @@ -1358,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask) const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index cc457ff818ad..51402a7e4ca6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); #endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#include <linux/jump_label.h> +#include <asm/string_64.h> + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + static_branch_inc(&mcsafe_key); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if ((capid0 & 0xc0) == 0xc0) + static_branch_inc(&mcsafe_key); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 63bf27d972b7..e244c19a2451 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -705,7 +705,7 @@ static void native_machine_power_off(void) tboot_shutdown(TB_SHUTDOWN_HALT); } -struct machine_ops machine_ops = { +struct machine_ops machine_ops __ro_after_init = { .power_off = native_machine_power_off, .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 80eab01c1a68..2408c1603438 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail) int i; struct e820entry *entry; - for (i = 0; i < e820.nr_map; i++) { - entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + entry = &e820->map[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..bbfbca5fea0c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -__visible unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -458,8 +458,8 @@ static void __init e820_reserve_setup_data(void) early_memunmap(data, sizeof(*data)); } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } @@ -763,7 +763,7 @@ static void __init trim_bios_range(void) */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } /* called before trim_bios_range() to spare extra sanitize */ @@ -1032,7 +1032,7 @@ void __init setup_arch(char **cmdline_p) if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); printk(KERN_INFO "fixed physical RAM map:\n"); e820_print_map("bad_ppro"); } @@ -1096,19 +1096,19 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) { + reserve_bios_regions(); + + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); - } - - reserve_bios_regions(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. - */ - if (efi_enabled(EFI_MEMMAP)) + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ efi_reserve_boot_services(); + } /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); @@ -1137,9 +1137,7 @@ void __init setup_arch(char **cmdline_p) * auditing all the early-boot CR4 manipulation would be needed to * rule it out. */ - if (boot_cpu_data.cpuid_level >= 0) - /* A CPU has %cr4 if and only if it has CPUID. */ - mmu_cr4_features = __read_cr4(); + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); @@ -1221,8 +1219,7 @@ void __init setup_arch(char **cmdline_p) /* * get boot-time SMP configuration: */ - if (smp_found_config) - get_smp_config(); + get_smp_config(); prefill_possible_map(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e068302d..2bbd27f89802 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); @@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE - 64; + IRQ_STACK_SIZE; #endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 04cb3212db2d..763af1d0de64 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -42,6 +42,7 @@ #include <asm/syscalls.h> #include <asm/sigframe.h> +#include <asm/signal.h> #define COPY(x) do { \ get_user_ex(regs->x, &sc->x); \ @@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) return -EFAULT; } @@ -660,20 +661,21 @@ badframe: return 0; } -static inline int is_ia32_compat_frame(void) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -static inline int is_ia32_frame(void) +static inline int is_ia32_frame(struct ksignal *ksig) { - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static inline int is_x32_frame(void) +static inline int is_x32_frame(struct ksignal *ksig) { - return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } static int @@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ - if (is_ia32_frame()) { + if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(usig, ksig, cset, regs); else return ia32_setup_frame(usig, ksig, cset, regs); - } else if (is_x32_frame()) { + } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, cset, regs); } else { return __setup_rt_frame(ksig->sig, ksig, set, regs); @@ -783,7 +785,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b44564bf86a8..40df33753bae 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,5 +1,6 @@ #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/ptrace.h> /* * The compat_siginfo_t structure and handing code is very easy @@ -92,10 +93,31 @@ static inline void signal_compat_build_tests(void) /* any new si_fields should be added here */ } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + /* Don't leak in-kernel non-uapi flags to user-space */ + if (oact) + oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (!act) + return; + + /* Don't let flags to be set from userspace */ + act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (user_64bit_mode(current_pt_regs())) + return; + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} + +int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, + bool x32_ABI) { int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); signal_compat_build_tests(); @@ -146,7 +168,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) put_user_ex(from->si_arch, &to->si_arch); break; case __SI_CHLD >> 16: - if (ia32) { + if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); } else { @@ -180,6 +202,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err; } +/* from syscall's path, where we know the ABI */ +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { int err = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4296beb8fdd3..42a93621f5b0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static struct sched_domain_topology_level numa_inside_package_topology[] = { +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif @@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = { #endif { NULL, }, }; + +static struct sched_domain_topology_level x86_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + /* - * set_sched_topology() sets the topology internal to a CPU. The - * NUMA topologies are layered on top of it to build the full - * system topology. - * - * If NUMA nodes are observed to occur within a CPU package, this - * function should be called. It forces the sched domain code to - * only use the SMT level for the CPU portion of the topology. - * This essentially falls back to relying on NUMA information - * from the SRAT table to describe the entire system topology - * (except for hyperthreads). + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours and Intel Cluster-on-Die have this. */ -static void primarily_use_numa_for_topology(void) -{ - set_sched_topology(numa_inside_package_topology); -} +static bool x86_has_numa_in_package; void set_cpu_sibling_map(int cpu) { @@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu) c->booted_cores = cpu_data(i).booted_cores; } if (match_die(c, o) && !topology_same_node(c, o)) - primarily_use_numa_for_topology(); + x86_has_numa_in_package = true; } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -690,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -717,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -756,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -942,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) per_cpu(cpu_current_top_of_stack, cpu) = (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else - clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif } @@ -969,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = idle->thread.sp; + initial_stack = idle->thread.sp; /* * Enable the espfix hack for this CPU @@ -994,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1115,17 +1115,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - /* - * We have to walk the irq descriptors to setup the vector - * space for the cpu which comes online. Prevent irq - * alloc/free across the bringup. - */ - irq_lock_sparse(); - err = do_boot_cpu(apicid, cpu, tidle); - if (err) { - irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -1143,8 +1134,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - irq_unlock_sparse(); - return 0; } @@ -1249,7 +1238,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", @@ -1304,6 +1293,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpus_sibling_map() is ran. + */ + set_sched_topology(x86_topology); + set_cpu_sibling_map(0); switch (smp_sanity_check(max_cpus)) { @@ -1323,14 +1322,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - default_setup_apic_routing(); - if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } + default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); @@ -1370,6 +1368,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + nmi_selftest(); impress_friends(); setup_ioapic_dest(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4738f5e0f2ab..0653788026e2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -8,80 +8,69 @@ #include <linux/export.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> +#include <asm/unwind.h> -static int save_stack_stack(void *data, char *name) +static int save_stack_address(struct stack_trace *trace, unsigned long addr, + bool nosched) { - return 0; -} - -static int -__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) -{ - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return 0; -#endif if (nosched && in_sched_functions(addr)) return 0; + if (trace->skip > 0) { trace->skip--; return 0; } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = addr; - return 0; - } else { - return -1; /* no more room, stop walking the stack */ - } -} -static int save_stack_address(void *data, unsigned long addr, int reliable) -{ - return __save_stack_address(data, addr, reliable, false); + if (trace->nr_entries >= trace->max_entries) + return -1; + + trace->entries[trace->nr_entries++] = addr; + return 0; } -static int -save_stack_address_nosched(void *data, unsigned long addr, int reliable) +static void __save_stack_trace(struct stack_trace *trace, + struct task_struct *task, struct pt_regs *regs, + bool nosched) { - return __save_stack_address(data, addr, reliable, true); -} + struct unwind_state state; + unsigned long addr; -static const struct stacktrace_ops save_stack_ops = { - .stack = save_stack_stack, - .address = save_stack_address, - .walk_stack = print_context_stack, -}; + if (regs) + save_stack_address(trace, regs->ip, nosched); -static const struct stacktrace_ops save_stack_ops_nosched = { - .stack = save_stack_stack, - .address = save_stack_address_nosched, - .walk_stack = print_context_stack, -}; + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || save_stack_address(trace, addr, nosched)) + break; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, regs, false); } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (!try_get_task_stack(tsk)) + return; + + __save_stack_trace(trace, tsk, NULL, true); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 654f6c66fe45..8402907825b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -188,12 +188,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820.nr_map; i++) { - if ((e820.map[i].type != E820_RAM) - && (e820.map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820->nr_map; i++) { + if ((e820->map[i].type != E820_RAM) + && (e820->map[i].type != E820_RESERVED_KERN)) continue; - add_mac_region(e820.map[i].addr, e820.map[i].size); + add_mac_region(e820->map[i].addr, e820->map[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b70ca12dd389..bd4e3d4d3625 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +#ifdef CONFIG_VMAP_STACK +__visible void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) +{ + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", + (void *)fault_address, current->stack, + (char *)current->stack + THREAD_SIZE - 1); + die(message, regs, 0); + + /* Be absolutely certain we don't return. */ + panic(message); +} +#endif + #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long cr2; +#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF): + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being + * deliv- ered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + cr2 = read_cr2(); + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); +#endif + #ifdef CONFIG_DOUBLEFAULT df_debug(regs, error_code); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 78b9cb5a26af..46b2f41f8b05 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -23,6 +23,7 @@ #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> +#include <asm/intel-family.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -686,11 +687,16 @@ unsigned long native_calibrate_tsc(void) if (crystal_khz == 0) { switch (boot_cpu_data.x86_model) { - case 0x4E: /* SKL */ - case 0x5E: /* SKL */ + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case 0x5C: /* BXT */ + case INTEL_FAM6_SKYLAKE_X: + crystal_khz = 25000; /* 25.0 MHz */ + break; + case INTEL_FAM6_ATOM_GOLDMONT: crystal_khz = 19200; /* 19.2 MHz */ break; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 000000000000..a2456d4d286a --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,93 @@ +#include <linux/sched.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + unsigned long *addr_p = unwind_get_return_address_ptr(state); + + if (unwind_done(state)) + return 0; + + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr_p); + + return __kernel_text_address(addr) ? addr : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool update_stack_state(struct unwind_state *state, void *addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If addr isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that 'info->next_sp' could point to an empty stack and 'addr' could + * be on a subsequent stack. + */ + while (!on_stack(info, addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + next_bp = (unsigned long *)*state->bp; + + /* make sure the next frame's data is accessible */ + if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + return false; + + /* move to the next frame */ + state->bp = next_bp; + return true; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* don't even attempt to start from user mode regs */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + /* set up the starting stack frame */ + state->bp = get_frame_pointer(task, regs); + + /* initialize stack info and make sure the frame data is accessible */ + get_stack_info(state->bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->bp < first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 000000000000..b5a834c93065 --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,43 @@ +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) + if (__kernel_text_address(*state->sp)) + return true; + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = first_frame; + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + if (!__kernel_text_address(*first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 95e49f6e4fc3..b2cee3d19477 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 76c5e52436c4..0bd9f1287f39 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; -struct x86_platform_ops x86_platform = { +struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, @@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi = { +struct x86_msi_ops x86_msi __ro_after_init = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, @@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops = { +struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { .read = native_io_apic_read, .disable = native_disable_io_apic, }; |