diff options
Diffstat (limited to 'arch/x86/kernel')
89 files changed, 2867 insertions, 1784 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84c00592d359..4b994232cb57 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -27,7 +27,7 @@ KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y # If instrumentation of this dir is enabled, boot hangs during first second. @@ -46,7 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o -obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o +obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o @@ -82,6 +82,7 @@ obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ae32838cac5f..6bb680671088 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <linux/pci.h> #include <linux/efi-bgrt.h> +#include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> @@ -179,10 +180,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -710,7 +716,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -1559,12 +1565,6 @@ int __init early_acpi_boot_init(void) return 0; } -static int __init acpi_parse_bgrt(struct acpi_table_header *table) -{ - efi_bgrt_init(table); - return 0; -} - int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1724,6 +1724,6 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820_add_region(addr, size, E820_ACPI); - update_e820(); + e820__range_add(addr, size, E820_TYPE_ACPI); + e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) #ifdef CONFIG_SMP initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index df083efe6ee0..815dd63f49d0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -36,7 +36,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 0a2bb1f62e72..ef2859f9fcce 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,7 +21,7 @@ #include <linux/pci.h> #include <linux/bitops.h> #include <linux/suspend.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -306,13 +306,13 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { /* reserve it, so we can reuse it in second kernel */ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", aper_base, aper_base + aper_size - 1); - e820_add_region(aper_base, aper_size, E820_RESERVED); - update_e820(); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); } } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4261b3282ad9..2d75faf743f2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void) TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; @@ -1610,24 +1614,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } #endif /* !CONFIG_X86_X2APIC */ -static int __init try_to_enable_IR(void) -{ -#ifdef CONFIG_X86_IO_APIC - if (!x2apic_enabled() && skip_ioapic_setup) { - pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); - return -1; - } -#endif - return irq_remapping_enable(); -} - void __init enable_IR_x2apic(void) { unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) + if (skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; + } ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) @@ -1645,7 +1640,7 @@ void __init enable_IR_x2apic(void) /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) - ir_stat = try_to_enable_IR(); + ir_stat = irq_remapping_enable(); /* ir_stat contains the remap mode or an error code */ try_to_enable_x2apic(ir_stat); @@ -1798,8 +1793,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() + * If the system has ACPI MADT tables or MP info, the LAPIC + * address is already registered. */ if (!acpi_lapic && !smp_found_config) register_lapic_address(apic_phys); @@ -2062,17 +2057,17 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "Only %d processors supported." + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids - 1, nr_logical_cpuids, apicid); - return -1; + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; } cpuid_to_apicid[nr_logical_cpuids] = apicid; return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2130,11 +2125,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2186,23 +2179,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); @@ -2258,7 +2241,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); #else /* * Hack: In case of kdump, after a crash, kernel might be booting @@ -2648,7 +2631,7 @@ static int __init lapic_insert_resource(void) } /* - * need call insert after e820_reserve_resources() + * need call insert after e820__reserve_resources() * that is using request_resource */ late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b109e4389c92..2262eb6df796 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -26,7 +26,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c48264e202fd..2e8f7f048f4f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -25,7 +25,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e9f8f8cdd570..b487b3a01615 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -34,6 +34,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> +#include <asm/e820/api.h> #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> @@ -1105,7 +1106,8 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hi) node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; - hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; if (uv_gp_table) { hi->global_mmr_base = uv_gp_table->mmr_base; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5a414545e8a3..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -2352,7 +2352,7 @@ static int __init apm_init(void) * Note we only set APM segments on CPU zero, since we pin the APM * code to that CPU. */ - gdt = get_cpu_gdt_table(0); + gdt = get_cpu_gdt_rw(0); set_desc_base(&gdt[APM_CS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); set_desc_base(&gdt[APM_CS_16 >> 3], diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 35a5d5dca2fa..ee8f11800295 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/cacheflush.h> +# include <asm/set_memory.h> #endif #include "cpu.h" @@ -556,10 +556,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a44ef52184df..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,7 +17,7 @@ #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> void __init check_bugs(void) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index adc0ebd8bed0..44207b71fee1 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,7 +3,7 @@ #include <linux/sched/clock.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -105,8 +105,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif - - clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b11b38c3b0bd..c8b39870f33e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -88,7 +88,6 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif - clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -449,19 +448,60 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ +#ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the GDT + * is read-only, that will triple fault. + * + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ + pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; +#endif + + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); +} + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + /* * Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ void switch_to_new_gdt(int cpu) { - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(cpu); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); + /* Load the original GDT */ + load_direct_gdt(cpu); /* Reload the per-cpu base */ - load_percpu_segment(cpu); } @@ -1077,8 +1117,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); - else - clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1111,7 +1149,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif - init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); setup_pku(c); @@ -1529,6 +1566,9 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #else @@ -1584,6 +1624,9 @@ void cpu_init(void) dbg_restore_debug_regs(); fpu__init_cpu(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 0a3bc19de017..a70fd61095f8 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -185,7 +185,6 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } - clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35691a6b0d32..4fa90006ac68 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,8 +28,11 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN - &x86_hyper_xen, +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, @@ -60,12 +63,6 @@ detect_hypervisor_vendor(void) pr_info("Hypervisor detected: %s\n", x86_hyper->name); } -void init_hypervisor(struct cpuinfo_x86 *c) -{ - if (x86_hyper && x86_hyper->set_cpu_features) - x86_hyper->set_cpu_features(c); -} - void __init init_hypervisor_platform(void) { @@ -74,8 +71,6 @@ void __init init_hypervisor_platform(void) if (!x86_hyper) return; - init_hypervisor(&boot_cpu_data); - if (x86_hyper->init_platform) x86_hyper->init_platform(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe0a615a051b..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -162,10 +158,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -492,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -606,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); - probe_xeon_phi_r3mwait(c); + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5a533fefefa0..5b366462f579 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -32,55 +32,98 @@ #include <asm/intel-family.h> #include <asm/intel_rdt.h> +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 + /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { { - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 0 + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 1 + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = IA32_L2_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 2, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .msr_base = IA32_MBA_THRTL_BASE, + .msr_update = mba_wrmsr, + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*d", }, }; -static int cbm_idx(struct rdt_resource *r, int closid) +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) { - return closid * r->cbm_idx_multi + r->cbm_idx_offset; + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; } /* @@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void) return false; r->num_closid = 4; - r->cbm_len = 20; - r->max_cbm = max_cbm; - r->min_cbm_bits = 2; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.min_cbm_bits = 2; r->capable = true; r->enabled = true; @@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void) return false; } -static void rdt_get_config(int idx, struct rdt_resource *r) +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool rdt_get_mem_config(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->membw.max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; + r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + } + r->data_width = 3; + rdt_get_mba_infofile(r); + + r->capable = true; + r->enabled = true; + + return true; +} + +static void rdt_get_cache_config(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; - union cpuid_0x10_1_edx edx; + union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->cbm_len = eax.split.cbm_len + 1; - r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->data_width = (r->cache.cbm_len + 3) / 4; + rdt_get_cache_infofile(r); r->capable = true; r->enabled = true; } @@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type) struct rdt_resource *r = &rdt_resources_all[type]; r->num_closid = r_l3->num_closid / 2; - r->cbm_len = r_l3->cbm_len; - r->max_cbm = r_l3->max_cbm; + r->cache.cbm_len = r_l3->cache.cbm_len; + r->default_ctrl = r_l3->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; r->capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter @@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type) r->enabled = false; } -static inline bool get_rdt_resources(void) -{ - bool ret = false; - - if (cache_alloc_hsw_probe()) - return true; - - if (!boot_cpu_has(X86_FEATURE_RDT_A)) - return false; - - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } - ret = true; - } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { - /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); - ret = true; - } - - return ret; -} - static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level) return -1; } -void rdt_cbm_update(void *arg) +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - struct msr_param *m = (struct msr_param *)arg; + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; struct rdt_resource *r = m->res; - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - goto found; + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + r->msr_update(d, m, r); + return; + } } - pr_info_once("cpu %d not found in any domain for resource %s\n", + pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); - - return; - -found: - for (i = m->low; i < m->high; i++) { - int idx = cbm_idx(r, i); - - wrmsrl(r->msr_base + idx, d->cbm[i]); - } } /* @@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc; + int i; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + d->ctrl_val = dc; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100 + */ + for (i = 0; i < r->num_closid; i++, dc++) + *dc = r->default_ctrl; + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int i, id = get_cache_id(cpu, r->cache_level); + int id = get_cache_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); - if (!d->cbm) { + if (domain_setup_ctrlval(r, d)) { kfree(d); return; } - for (i = 0; i < r->num_closid; i++) { - int idx = cbm_idx(r, i); - - d->cbm[i] = r->max_cbm; - wrmsrl(r->msr_base + idx, d->cbm[i]); - } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); - r->num_domains++; } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - r->num_domains--; - kfree(d->cbm); + kfree(d->ctrl_val); list_del(&d->list); kfree(d); } @@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu) return 0; } +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + int cl; + + for_each_capable_rdt_resource(r) { + cl = strlen(r->name); + if (cl > max_name_width) + max_name_width = cl; + + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +static __init bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) + ret = true; + } + + return ret; +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void) if (!get_rdt_resources()) return -ENODEV; + rdt_init_padding(); + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt/cat:online:", intel_rdt_online_cpu, intel_rdt_offline_cpu); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 0bbe0f3a039f..f5af0cc7eb0d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -28,7 +28,6 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/slab.h> -#include <linux/cpu.h> #include <linux/task_work.h> #include <uapi/linux/magic.h> @@ -175,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -183,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); - else + if (rdtgrp) { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } else { ret = -ENOENT; + } rdtgroup_kn_unlock(of->kn); return ret; @@ -253,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - ret = cpumask_parse(buf, newmask); + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + if (ret) goto unlock; @@ -474,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = { .seq_show = rdtgroup_cpus_show, }, { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + }, + { .name = "tasks", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -495,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; seq_printf(seq, "%d\n", r->num_closid); + return 0; +} +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->default_ctrl); return 0; } -static int rdt_cbm_mask_show(struct kernfs_open_file *of, +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%x\n", r->max_cbm); + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", r->membw.min_bw); return 0; } -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, +static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->min_cbm_bits); + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; } /* rdtgroup information files for one cache resource. */ -static struct rftype res_info_files[] = { +static struct rftype res_cache_info_files[] = { { .name = "num_closids", .mode = 0444, @@ -531,7 +575,7 @@ static struct rftype res_info_files[] = { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_cbm_mask_show, + .seq_show = rdt_default_ctrl_show, }, { .name = "min_cbm_bits", @@ -541,11 +585,52 @@ static struct rftype res_info_files[] = { }, }; +/* rdtgroup information files for memory bandwidth. */ +static struct rftype res_mba_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + }, +}; + +void rdt_get_mba_infofile(struct rdt_resource *r) +{ + r->info_files = res_mba_info_files; + r->nr_info_files = ARRAY_SIZE(res_mba_info_files); +} + +void rdt_get_cache_infofile(struct rdt_resource *r) +{ + r->info_files = res_cache_info_files; + r->nr_info_files = ARRAY_SIZE(res_cache_info_files); +} + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct kernfs_node *kn_subdir; + struct rftype *res_info_files; struct rdt_resource *r; - int ret; + int ret, len; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -564,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn_subdir, res_info_files, - ARRAY_SIZE(res_info_files)); + + res_info_files = r->info_files; + len = r->nr_info_files; + + ret = rdtgroup_add_files(kn_subdir, res_info_files, len); if (ret) goto out_destroy; kernfs_activate(kn_subdir); @@ -728,7 +816,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(kn); + kernfs_put(rdtgrp->kn); kfree(rdtgrp); } else { kernfs_unbreak_active_protection(kn); @@ -781,7 +869,7 @@ out: return dentry; } -static int reset_all_cbms(struct rdt_resource *r) +static int reset_all_ctrls(struct rdt_resource *r) { struct msr_param msr_param; cpumask_var_t cpu_mask; @@ -804,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r) cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < r->num_closid; i++) - d->cbm[i] = r->max_cbm; + d->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -897,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb) /*Put everything back to default values. */ for_each_enabled_rdt_resource(r) - reset_all_cbms(r); + reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); static_branch_disable(&rdt_enable_key); diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index f369cb8db0d5..406d7a6532f9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -29,26 +29,77 @@ #include <asm/intel_rdt.h> /* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* * Check whether a cache bit mask is valid. The SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(unsigned long var, struct rdt_resource *r) +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) { - unsigned long first_bit, zero_bit; + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; - if (var == 0 || var > r->max_cbm) + if (val == 0 || val > r->default_ctrl) return false; - first_bit = find_first_bit(&var, r->cbm_len); - zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) return false; - if ((zero_bit - first_bit) < r->min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) return false; + + *data = val; return true; } @@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -static int parse_cbm(char *buf, struct rdt_resource *r) +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - int ret; - ret = kstrtoul(buf, 16, &data); - if (ret) - return ret; - if (!cbm_validate(data, r)) + if (d->have_new_ctrl) return -EINVAL; - r->tmp_cbms[r->num_tmp_cbms++] = data; + + if(!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; return 0; } @@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r) /* * For each domain in this resource we expect to find a series of: * id=mask - * separated by ";". The "id" is in decimal, and must appear in the - * right order. + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. */ static int parse_line(char *line, struct rdt_resource *r) { @@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r) struct rdt_domain *d; unsigned long dom_id; +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); list_for_each_entry(d, &r->domains, list) { - dom = strsep(&line, ";"); - if (!dom) - return -EINVAL; - id = strsep(&dom, "="); - if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) - return -EINVAL; - if (parse_cbm(dom, r)) - return -EINVAL; + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } } - - /* Any garbage at the end of the line? */ - if (line && line[0]) - return -EINVAL; - return 0; + return -EINVAL; } static int update_domains(struct rdt_resource *r, int closid) @@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu, idx = 0; + int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid) msr_param.res = r; list_for_each_entry(d, &r->domains, list) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } } + if (cpumask_empty(cpu_mask)) + goto done; cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on other cpus. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); +done: free_cpumask_var(cpu_mask); return 0; } +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; int closid, ret = 0; - u32 *l3_cbms = NULL; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - /* get scratch space to save all the masks while we validate input */ for_each_enabled_rdt_resource(r) { - r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), - GFP_KERNEL); - if (!r->tmp_cbms) { - ret = -ENOMEM; - goto out; - } - r->num_tmp_cbms = 0; + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; } while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strsep(&tok, ":"); + resname = strim(strsep(&tok, ":")); if (!tok) { ret = -EINVAL; goto out; } - for_each_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && - closid < r->num_closid) { - ret = parse_line(tok, r); - if (ret) - goto out; - break; - } - } - if (!r->name) { - ret = -EINVAL; - goto out; - } - } - - /* Did the parser find all the masks we need? */ - for_each_enabled_rdt_resource(r) { - if (r->num_tmp_cbms != r->num_domains) { - ret = -EINVAL; + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) goto out; - } } for_each_enabled_rdt_resource(r) { @@ -201,10 +245,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - for_each_enabled_rdt_resource(r) { - kfree(r->tmp_cbms); - r->tmp_cbms = NULL; - } return ret ?: nbytes; } @@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) struct rdt_domain *dom; bool sep = false; - seq_printf(s, "%s:", r->name); + seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); sep = true; } seq_puts(s, "\n"); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index a3311c886194..43051f0777d4 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c new file mode 100644 index 000000000000..9c632cb88546 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -0,0 +1,397 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "mce-internal.h" + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +/* User mode helper program triggered by machine check event */ +extern char mce_helper[128]; + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int next, entry; + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return NOTIFY_OK; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + cpu_tsc[smp_processor_id()] = rdtsc(); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = mce_log_get_idx_check(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; + } + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 1e5a50c11d3c..217cd4449bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 903043e6a62b..654ad0668d72 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) m1->addr != m2->addr || m1->misc != m2->misc; } + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +extern void mce_work_trigger(void); +#else +static inline void mce_work_trigger(void) { } +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8e9725c607ea..5abd4bf73d6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/cpu.h> +#include <linux/ras.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> @@ -49,18 +50,11 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/reboot.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) +static DEFINE_MUTEX(mce_log_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -/* User mode helper program triggered by machine check event */ -static unsigned long mce_need_notify; -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - static DEFINE_PER_CPU(struct mce, mces_seen); -static int cpu_missing; +static unsigned long mce_need_notify; +static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -143,82 +131,38 @@ void mce_setup(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -void mce_log(struct mce *mce) +void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(mce); - - if (!mce_gen_pool_add(mce)) + if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) { - mutex_lock(&mce_chrdev_read_mutex); + mutex_lock(&mce_log_mutex); mce_log(m); - mutex_unlock(&mce_chrdev_read_mutex); + mutex_unlock(&mce_log_mutex); } EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. + */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) { - atomic_inc(&num_notifiers); + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; - WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); + atomic_inc(&num_notifiers); - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -226,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb) { atomic_dec(&num_notifiers); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -319,18 +263,7 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { - int ret = 0; - __print_mce(m); - - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -519,7 +452,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -548,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs) */ static int mce_usable_address(struct mce *m) { - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + if (!(m->status & MCI_STATUS_ADDRV)) return 0; /* Checks after this one are Intel-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 1; + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; + return 1; } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -591,11 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. - */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; __print_mce(m); @@ -648,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -1127,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); @@ -1399,13 +1386,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* * Notify the user(s) about new machine check events. * Can be called from interrupt context, but not from machine check/NMI @@ -1417,11 +1397,7 @@ int mce_notify_irq(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); + mce_work_trigger(); if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); @@ -1688,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { + if (c->x86_vendor == X86_VENDOR_AMD) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - /* - * Install proper ops for Scalable MCA enabled processors - */ if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; msr_ops.status = smca_status_reg; msr_ops.addr = smca_addr_reg; msr_ops.misc = smca_misc_reg; } - mce_amd_feature_init(c); + } +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); break; } @@ -1798,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; + __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); @@ -1823,252 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) } -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." - */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - next = mce_log_get_idx_check(mcelog.next); - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } - } - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); - - return err ? err : buf - ubuf; -} - -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); - -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) -{ - mce_write = fn; -} -EXPORT_SYMBOL_GPL(register_mce_write_callback); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else - return -EINVAL; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); @@ -2142,6 +1878,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2286,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return size; } -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr, const char *buf, size_t size) @@ -2365,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); @@ -2388,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, +#endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, @@ -2562,7 +2277,6 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { - enum cpuhp_state hp_online; int err; if (!mce_available(&boot_cpu_data)) { @@ -2590,21 +2304,11 @@ static __init int mcheck_init_device(void) mce_cpu_online, mce_cpu_pre_down); if (err < 0) goto err_out_online; - hp_online = err; register_syscore_ops(&mce_syscore_ops); - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) - goto err_register; - return 0; -err_register: - unregister_syscore_ops(&mce_syscore_ops); - cpuhp_remove_state(hp_online); - err_out_online: cpuhp_remove_state(CPUHP_X86_MCE_DEAD); @@ -2612,7 +2316,7 @@ err_out_mem: free_cpumask_var(mce_device_initialized); err_out: - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + pr_err("Unable to init MCE device (rc: %d)\n", err); return err; } @@ -2691,6 +2395,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 524cc5780a77..6e4a047e4b68 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 190b3e6cef4d..e84db79ef272 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) return; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index b5375b9497b3..04cb8d34ccb8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -49,6 +49,9 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); + if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 3b442b64c72d..765afd599039 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -27,7 +27,7 @@ #include <linux/range.h> #include <asm/processor.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); } /** @@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) WARN_ON(1); pr_info("update e820 for mtrr\n"); - update_e820(); + e820__update_table_print(); return 1; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 24e87e74990d..2bce84d91c2b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -48,7 +48,7 @@ #include <linux/syscore_ops.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 18ca99f2798b..6df621ae62a7 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" - "wp\t\t: %s\n", + "wp\t\t: yes\n", static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - c->cpuid_level, - c->wp_works_ok ? "yes" : "no"); + c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d9794060fe22..23c23508c012 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 8457b4978668..d77d07ab310b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -16,8 +16,6 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } - - clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 891f4dad7b2c..40ed26852ebd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,7 +30,6 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> -#include <asm/timer.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -114,6 +113,24 @@ static void __init vmware_paravirt_ops_setup(void) #define vmware_paravirt_ops_setup() do {} while (0) #endif +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +} + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -153,6 +170,8 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif + + vmware_set_capabilities(); } /* @@ -177,24 +196,6 @@ static uint32_t __init vmware_platform(void) return 0; } -/* - * VMware hypervisor takes care of exporting a reliable TSC to the guest. - * Still, due to timing difference when running on virtual cpus, the TSC can - * be marked as unstable in some cases. For example, the TSC sync check at - * bootup can fail due to a marginal offset between vcpus' TSCs (though the - * TSCs do not drift from each other). Also, the ACPI PM timer clocksource - * is not suitable as a watchdog when running on a hypervisor because the - * kernel may miss a wrap of the counter if the vcpu is descheduled for a - * long time. To skip these checks at runtime we set these capability bits, - * so that the kernel could just trust the hypervisor with providing a - * reliable virtual TSC that is suitable for timekeeping. - */ -static void vmware_set_cpu_features(struct cpuinfo_x86 *c) -{ - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); -} - /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { @@ -207,7 +208,6 @@ static bool __init vmware_legacy_x2apic_available(void) const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, .x2apic_available = vmware_legacy_x2apic_available, }; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 3741461c63a0..22217ece26c8 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,6 +29,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/e820/types.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <linux/kdebug.h> @@ -503,16 +504,16 @@ static int prepare_elf_headers(struct kimage *image, void **addr, return ret; } -static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; nr_e820_entries = params->e820_entries; - if (nr_e820_entries >= E820MAX) + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_map[nr_e820_entries], entry, - sizeof(struct e820entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, + sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -521,7 +522,7 @@ static int memmap_entry_callback(u64 start, u64 end, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; - struct e820entry ei; + struct e820_entry ei; ei.addr = start; ei.size = end - start + 1; @@ -560,7 +561,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { int i, ret = 0; unsigned long flags; - struct e820entry ei; + struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; @@ -574,17 +575,17 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add first 640K segment */ ei.addr = image->arch.backup_src_start; ei.size = image->arch.backup_src_sz; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); /* Add ACPI tables */ - cmd.type = E820_ACPI; + cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ - cmd.type = E820_NVS; + cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); @@ -592,7 +593,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (crashk_low_res.end) { ei.addr = crashk_low_res.start; ei.size = crashk_low_res.end - crashk_low_res.start + 1; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } @@ -609,7 +610,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (ei.size < PAGE_SIZE) continue; ei.addr = cmem->ranges[i].start; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 09d4ac0d2661..dbce3cca94cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -77,7 +77,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (regs = NULL; stack; stack = stack_info.next_sp) { + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; /* @@ -289,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) - report_bug(regs->ip, regs); - if (__die(str, regs, err)) sig = 0; oops_end(flags, regs, sig); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b0b3a3df7c20..e5f0b40e66d2 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -162,15 +162,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a8b117e93b46..3e1471d57487 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -178,13 +178,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b2bbad6ebe4d..d78a586ba8dc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1,49 +1,55 @@ /* - * Handle the memory map. - * The functions here do the job until bootmem takes over. + * Low level x86 E820 memory map handling functions. * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * The firmware and bootloader passes us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. * + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> #include <linux/crash_dump.h> -#include <linux/export.h> #include <linux/bootmem.h> -#include <linux/pfn.h> #include <linux/suspend.h> #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> #include <linux/sort.h> -#include <asm/e820.h> -#include <asm/proto.h> +#include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/cpufeature.h> /* - * The e820 map is the map that gets modified e.g. with command line parameters - * and that is also registered with modifications in the kernel resource tree - * with the iomem_resource as parent. + * We organize the E820 table into two main data structures: * - * The e820_saved is directly saved after the BIOS-provided memory map is - * copied. It doesn't get modified afterwards. It's registered for the - * /sys/firmware/memmap interface. + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. We use this to: * - * That memory map is not modified and is used as base for kexec. The kexec'd - * kernel should get the same memory map as the firmware provides. Then the - * user can e.g. boot the original kernel with mem=1G while still booting the - * next kernel with full memory. + * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent MD5 + * fingerprint of the physical memory layout of a system. + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So itsmain role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. */ -static struct e820map initial_e820 __initdata; -static struct e820map initial_e820_saved __initdata; -struct e820map *e820 __refdata = &initial_e820; -struct e820map *e820_saved __refdata = &initial_e820_saved; +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -55,51 +61,53 @@ EXPORT_SYMBOL(pci_mem_start); * This function checks if any part of the range <start,end> is mapped * with type. */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - if (ei->addr >= end || ei->addr + ei->size <= start) + if (entry->addr >= end || entry->addr + entry->size <= start) continue; return 1; } return 0; } -EXPORT_SYMBOL_GPL(e820_any_mapped); +EXPORT_SYMBOL_GPL(e820__mapped_any); /* - * This function checks if the entire range <start,end> is mapped with type. + * This function checks if the entire <start,end> range is mapped with 'type'. * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) continue; - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there + /* + * If the region is at the beginning of <start,end> we move + * 'start' to the end of the region since it's ok until there */ - if (ei->addr <= start) - start = ei->addr + ei->size; + if (entry->addr <= start) + start = entry->addr + entry->size; + /* - * if start is now at or beyond end, we're done, full - * coverage + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: */ if (start >= end) return 1; @@ -108,94 +116,77 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) } /* - * Add a memory region to the kernel e820 map. + * Add a memory region to the kernel E820 map. */ -static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = e820x->nr_map; + int x = table->nr_entries; - if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); return; } - e820x->map[x].addr = start; - e820x->map[x].size = size; - e820x->map[x].type = type; - e820x->nr_map++; + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; } -void __init e820_add_region(u64 start, u64 size, int type) +void __init e820__range_add(u64 start, u64 size, enum e820_type type) { - __e820_add_region(e820, start, size, type); + __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(u32 type) +static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "usable"); - break; - case E820_RESERVED: - printk(KERN_CONT "reserved"); - break; - case E820_ACPI: - printk(KERN_CONT "ACPI data"); - break; - case E820_NVS: - printk(KERN_CONT "ACPI NVS"); - break; - case E820_UNUSABLE: - printk(KERN_CONT "unusable"); - break; - case E820_PMEM: - case E820_PRAM: - printk(KERN_CONT "persistent (type %u)", type); - break; - default: - printk(KERN_CONT "type %u", type); - break; + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; } } -void __init e820_print_map(char *who) +void __init e820__print_table(char *who) { int i; - for (i = 0; i < e820->nr_map; i++) { - printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820->map[i].addr, - (unsigned long long) - (e820->map[i].addr + e820->map[i].size - 1)); - e820_print_type(e820->map[i].type); - printk(KERN_CONT "\n"); + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); } } /* - * Sanitize the BIOS e820 map. + * Sanitize an E820 map. * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps, + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, * and resolving conflicting memory types in favor of highest * numbered type. * - * The input parameter biosmap points to an array of 'struct - * e820entry' which on entry has elements in the range [0, *pnr_map) - * valid, and which has space for up to max_nr_map entries. - * On return, the resulting sanitized e820 map entries will be in - * overwritten in the same location, starting at biosmap. + * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be in + * overwritten in the same location, starting at 'entries'. * - * The integer pointed to by pnr_map must be valid on entry (the - * current number of valid entries located at biosmap). If the - * sanitizing succeeds the *pnr_map will be updated with the new - * number of valid entries (something no more than max_nr_map). + * The integer pointed to by nr_entries must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds the *nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). * - * The return value from sanitize_e820_map() is zero if it + * The return value from e820__update_table() is zero if it * successfully 'sanitized' the map entries passed in, and is -1 * if it did nothing, which can happen if either of (1) it was * only passed one map entry, or (2) any of the input map entries @@ -238,10 +229,17 @@ void __init e820_print_map(char *who) * ______________________4_ */ struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; }; +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + static int __init cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; @@ -249,164 +247,140 @@ static int __init cpcompare(const void *a, const void *b) /* * Inputs are pointers to two elements of change_point[]. If their - * addresses are unequal, their difference dominates. If the addresses + * addresses are not equal, their difference dominates. If the addresses * are equal, then consider one that represents the end of its region * to be greater than one that does not. */ if (ap->addr != bp->addr) return ap->addr > bp->addr ? 1 : -1; - return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - u32 *pnr_map) +int __init e820__update_table(struct e820_table *table) { - static struct change_member change_point_list[2*E820_X_MAX] __initdata; - static struct change_member *change_point[2*E820_X_MAX] __initdata; - static struct e820entry *overlap_list[E820_X_MAX] __initdata; - static struct e820entry new_bios[E820_X_MAX] __initdata; - unsigned long current_type, last_type; + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; unsigned long long last_addr; - int chgidx; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) return -1; - old_nr = *pnr_map; - BUG_ON(old_nr > max_nr_map); + BUG_ON(table->nr_entries > max_nr_entries); - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) return -1; + } - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) change_point[i] = &change_point_list[i]; - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; } } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; + chg_nr = chg_idx; + + /* Sort change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ + /* Remove entry from list (order independent, so swap with last): */ for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; } overlap_entries--; } /* - * if there are overlapping entries, decide which + * If there are overlapping entries, decide which * "type" to use (larger value takes precedence -- * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) { if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type || current_type == E820_PRAM) { + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || current_type == E820_TYPE_PRAM) { if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= max_nr_map) + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? */ + if (++new_nr_entries >= max_nr_entries) break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; } last_type = current_type; } } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; return 0; } -static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { - while (nr_map) { - u64 start = biosmap->addr; - u64 size = biosmap->size; + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; u64 end = start + size - 1; - u32 type = biosmap->type; + u32 type = entry->type; - /* Overflow in 64 bits? Ignore the memory map. */ + /* Ignore the entry on 64-bit overflow: */ if (start > end && likely(size)) return -1; - e820_add_region(start, size, type); + e820__range_add(start, size, type); - biosmap++; - nr_map--; + entry++; + nr_entries--; } return 0; } /* - * Copy the BIOS e820 map into a safe place. + * Copy the BIOS E820 map into a safe place. * * Sanity-check it while we're at it.. * @@ -414,18 +388,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) + if (nr_entries < 2) return -1; - return __append_e820_map(biosmap, nr_map); + return __append_e820_table(entries, nr_entries); } -static u64 __init __e820_update_range(struct e820map *e820x, u64 start, - u64 size, unsigned old_type, - unsigned new_type) +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; unsigned int i; @@ -437,77 +410,73 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); e820_print_type(old_type); - printk(KERN_CONT " ==> "); + pr_cont(" ==> "); e820_print_type(new_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820x->nr_map; i++) { - struct e820entry *ei = &e820x->map[i]; + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (ei->type != old_type) + if (entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered by new range? */ - if (ei->addr >= start && ei_end <= end) { - ei->type = new_type; - real_updated_size += ei->size; + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_updated_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; - __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + __e820__range_add(table, final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * its size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_updated_size; } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820, start, size, old_type, new_type); + return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820_update_range_saved(u64 start, u64 size, - unsigned old_type, unsigned new_type) +static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820_saved, start, size, old_type, - new_type); + return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); } -/* make e820 not cover the range */ -u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, - int checktype) +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { int i; u64 end; @@ -517,85 +486,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); - if (checktype) + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) e820_print_type(old_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (checktype && ei->type != old_type) + if (check_type && entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered? */ - if (ei->addr >= start && ei_end <= end) { - real_removed_size += ei->size; - memset(ei, 0, sizeof(struct e820entry)); + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - e820_add_region(end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_removed_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; + real_removed_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * the size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_removed_size; } -void __init update_e820(void) +void __init e820__update_table_print(void) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) + if (e820__update_table(e820_table)) return; - printk(KERN_INFO "e820: modified physical RAM map:\n"); - e820_print_map("modified"); + + pr_info("e820: modified physical RAM map:\n"); + e820__print_table("modified"); } -static void __init update_e820_saved(void) + +static void __init e820__update_table_firmware(void) { - sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), - &e820_saved->nr_map); + e820__update_table(e820_table_firmware); } + #define MAX_GAP_END 0x100000000ull + /* - * Search for a gap in the e820 memory space from 0 to MAX_GAP_END. + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). */ -static int __init e820_search_gap(unsigned long *gapstart, - unsigned long *gapsize) +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { unsigned long long last = MAX_GAP_END; - int i = e820->nr_map; + int i = e820_table->nr_entries; int found = 0; while (--i >= 0) { - unsigned long long start = e820->map[i].addr; - unsigned long long end = start + e820->map[i].size; + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; /* * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true + * fit in 32 bits if this condition is true: */ if (last > end) { unsigned long gap = last - end; @@ -613,12 +586,14 @@ static int __init e820_search_gap(unsigned long *gapstart, } /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * * Hopefully the BIOS let enough space left. */ -__init void e820_setup_gap(void) +__init void e820__setup_pci_gap(void) { unsigned long gapstart, gapsize; int found; @@ -629,138 +604,143 @@ __init void e820_setup_gap(void) if (!found) { #ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR - "e820: cannot find a gap in the 32bit address range\n" - "e820: PCI devices with unassigned 32bit BARs may break!\n"); + pr_err( + "e820: Cannot find an available gap in the 32-bit address range\n" + "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); #else gapstart = 0x10000000; #endif } /* - * e820_reserve_resources_late protect stolen RAM already + * e820__reserve_resources_late() protects stolen RAM already: */ pci_mem_start = gapstart; - printk(KERN_INFO - "e820: [mem %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); } /* * Called late during init, in free_initmem(). * - * Initial e820 and e820_saved are largish __initdata arrays. - * Copy them to (usually much smaller) dynamically allocated area. - * This is done after all tweaks we ever do to them: - * all functions which modify them are __init functions, - * they won't exist after this point. + * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). */ -__init void e820_reallocate_tables(void) +__init void e820__reallocate_tables(void) { - struct e820map *n; + struct e820_table *n; int size; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820, size); - e820 = n; + memcpy(n, e820_table, size); + e820_table = n; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_saved, size); - e820_saved = n; + memcpy(n, e820_table_firmware, size); + e820_table_firmware = n; } -/** - * Because of the size limitation of struct boot_params, only first - * 128 E820 memory entries are passed to kernel via - * boot_params.e820_map, others are passed via SETUP_E820_EXT node of - * linked list of struct setup_data, which is parsed here. +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. */ -void __init parse_e820_ext(u64 phys_addr, u32 data_len) +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; - struct e820entry *extmap; + struct boot_e820_entry *extmap; struct setup_data *sdata; sdata = early_memremap(phys_addr, data_len); - entries = sdata->len / sizeof(struct e820entry); - extmap = (struct e820entry *)(sdata->data); - __append_e820_map(extmap, entries); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + early_memunmap(sdata, data_len); - printk(KERN_INFO "e820: extended physical RAM map:\n"); - e820_print_map("extended"); + pr_info("e820: extended physical RAM map:\n"); + e820__print_table("extended"); } -#if defined(CONFIG_X86_64) || \ - (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -/** +/* * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for - * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). * - * This function requires the e820 map to be sorted and without any + * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820_mark_nosave_regions(unsigned long limit_pfn) +void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; unsigned long pfn = 0; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); - pfn = PFN_DOWN(ei->addr + ei->size); + pfn = PFN_DOWN(entry->addr + entry->size); - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) - register_nosave_region(PFN_UP(ei->addr), pfn); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) break; } } -#endif #ifdef CONFIG_ACPI -/** - * Mark ACPI NVS memory region, so that we can save/restore it during - * hibernation and the subsequent resume. +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: */ -static int __init e820_mark_nvs_memory(void) +static int __init e820__register_nvs_regions(void) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (ei->type == E820_NVS) - acpi_nvs_register(ei->addr, ei->size); + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); } return 0; } -core_initcall(e820_mark_nvs_memory); +core_initcall(e820__register_nvs_regions); #endif /* - * pre allocated 4k and reserved it in memblock and e820_saved + * Allocate the requested number of bytes with the requsted alignment + * and return (the physical address) to the caller. Also register this + * range in the 'firmware' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. */ -u64 __init early_reserve_e820(u64 size, u64 align) +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); - update_e820_saved(); + e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); + e820__update_table_firmware(); } return addr; @@ -779,22 +759,22 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + if (entry->type != type) continue; - start_pfn = ei->addr >> PAGE_SHIFT; - end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; if (start_pfn >= limit_pfn) continue; @@ -809,18 +789,19 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", + pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } -unsigned long __init e820_end_of_ram_pfn(void) + +unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); } -unsigned long __init e820_end_of_low_ram_pfn(void) +unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); } static void __init early_panic(char *msg) @@ -831,7 +812,7 @@ static void __init early_panic(char *msg) static int userdef __initdata; -/* "mem=nopentium" disables the 4MB page tables. */ +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ static int __init parse_memopt(char *p) { u64 mem_size; @@ -844,17 +825,19 @@ static int __init parse_memopt(char *p) setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; #else - printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); return -EINVAL; #endif } userdef = 1; mem_size = memparse(p, &p); - /* don't remove all of memory when handling "mem={invalid}" param */ + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ if (mem_size == 0) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); return 0; } @@ -872,12 +855,12 @@ static int __init parse_memmap_one(char *p) #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know - * the real mem size before original memory map is + * the real memory size before the original memory map is * reset. */ - saved_max_pfn = e820_end_of_ram_pfn(); + saved_max_pfn = e820__end_of_ram_pfn(); #endif - e820->nr_map = 0; + e820_table->nr_entries = 0; userdef = 1; return 0; } @@ -890,21 +873,23 @@ static int __init parse_memmap_one(char *p) userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RAM); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); } else if (*p == '#') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_ACPI); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); } else if (*p == '$') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RESERVED); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); } else if (*p == '!') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_PRAM); - } else - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } return *p == '\0' ? 0 : -EINVAL; } + static int __init parse_memmap_opt(char *str) { while (str) { @@ -921,68 +906,97 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt); -void __init finish_e820_parsing(void) +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + pa_data = data->next; + early_memunmap(data, sizeof(*data)); + } + + e820__update_table(e820_table); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) { if (userdef) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), - &e820->nr_map) < 0) + if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - printk(KERN_INFO "e820: user-defined physical RAM map:\n"); - e820_print_map("user"); + pr_info("e820: user-defined physical RAM map:\n"); + e820__print_table("user"); } } -static const char *__init e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: return "System RAM"; - case E820_ACPI: return "ACPI Tables"; - case E820_NVS: return "ACPI Non-volatile Storage"; - case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent Memory (legacy)"; - case E820_PMEM: return "Persistent Memory"; - default: return "reserved"; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + default: return "Unknown E820 type"; } } -static unsigned long __init e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: - return IORESOURCE_SYSTEM_RAM; - case E820_ACPI: - case E820_NVS: - case E820_UNUSABLE: - case E820_PRAM: - case E820_PMEM: - default: - return IORESOURCE_MEM; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; } } -static unsigned long __init e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) { - switch (e820_type) { - case E820_ACPI: - return IORES_DESC_ACPI_TABLES; - case E820_NVS: - return IORES_DESC_ACPI_NV_STORAGE; - case E820_PMEM: - return IORES_DESC_PERSISTENT_MEMORY; - case E820_PRAM: - return IORES_DESC_PERSISTENT_MEMORY_LEGACY; - case E820_RESERVED_KERN: - case E820_RAM: - case E820_UNUSABLE: - default: - return IORES_DESC_NONE; + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORES_DESC_NONE; } } -static bool __init do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(enum e820_type type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -993,61 +1007,71 @@ static bool __init do_mark_busy(u32 type, struct resource *res) * for exclusive use of a driver */ switch (type) { - case E820_RESERVED: - case E820_PRAM: - case E820_PMEM: + case E820_TYPE_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: default: return true; } } /* - * Mark e820 reserved areas as busy for the resource manager. + * Mark E820 reserved areas as busy for the resource manager: */ + static struct resource __initdata *e820_res; -void __init e820_reserve_resources(void) + +void __init e820__reserve_resources(void) { int i; struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); + res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries); e820_res = res; - for (i = 0; i < e820->nr_map; i++) { - end = e820->map[i].addr + e820->map[i].size - 1; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820->map[i].type); - res->start = e820->map[i].addr; - res->end = end; - - res->flags = e820_type_to_iomem_type(e820->map[i].type); - res->desc = e820_type_to_iores_desc(e820->map[i].type); + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); /* - * don't register the region that could be conflicted with - * pci device BAR resource and insert them later in - * pcibios_resource_survey() + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): */ - if (do_mark_busy(e820->map[i].type, res)) { + if (do_mark_busy(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved->nr_map; i++) { - struct e820entry *entry = &e820_saved->map[i]; - firmware_map_add_early(entry->addr, - entry->addr + entry->size, - e820_type_to_string(entry->type)); + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } } -/* How much should we pad RAM ending depending on where it is? */ +/* + * How much should we pad the end of RAM, depending on where it is? + */ static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1066,64 +1090,59 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820_reserve_resources_late(void) +void __init e820__reserve_resources_late(void) { int i; struct resource *res; res = e820_res; - for (i = 0; i < e820->nr_map; i++) { + for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; } /* - * Try to bump up RAM regions to reasonable boundaries to + * Try to bump up RAM regions to reasonable boundaries, to * avoid stolen RAM: */ - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 start, end; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; + start = entry->addr + entry->size; end = round_up(start, ram_alignment(start)) - 1; if (end > MAX_RESOURCE_SIZE) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG - "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", - start, end); - reserve_region_with_split(&iomem_resource, start, end, - "RAM buffer"); + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } -char *__init default_machine_specific_memory_setup(void) +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) { char *who = "BIOS-e820"; - u32 new_nr; + /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { u64 mem_size; - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -1131,84 +1150,68 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820->nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); } - /* In case someone cares... */ + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + return who; } -void __init setup_memory_map(void) +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) { char *who; + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + who = x86_init.resources.memory_setup(); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); - e820_print_map(who); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("e820: BIOS-provided physical RAM map:\n"); + e820__print_table(who); } -void __init memblock_x86_fill(void) +void __init e820__memblock_setup(void) { int i; u64 end; /* - * EFI may have more than 128 entries - * We are safe to enable resizing, beause memblock_x86_fill() - * is rather later for x86 + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) */ memblock_allow_resize(); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - end = ei->addr + ei->size; + end = entry->addr + entry->size; if (end != (resource_size_t)end) continue; - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; - memblock_add(ei->addr, ei->size); + memblock_add(entry->addr, entry->size); } - /* throw away partial pages */ + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); memblock_dump_all(); } - -void __init memblock_find_dma_reserve(void) -{ -#ifdef CONFIG_X86_64 - u64 nr_pages = 0, nr_free_pages = 0; - unsigned long start_pfn, end_pfn; - phys_addr_t start, end; - int i; - u64 u; - - /* - * need to find out used area below MAX_DMA_PFN - * need to use memblock to get free size in [0, MAX_DMA_PFN] - * at first, and assume boot_mem will not take below MAX_DMA_PFN - */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min(start_pfn, MAX_DMA_PFN); - end_pfn = min(end_pfn, MAX_DMA_PFN); - nr_pages += end_pfn - start_pfn; - } - - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) { - start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); - end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); - if (start_pfn < end_pfn) - nr_free_pages += end_pfn - start_pfn; - } - - set_dma_reserve(nr_pages - nr_free_pages); -#endif -} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6a08e25a48d8..d907c3d8633f 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -526,6 +526,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), }; static void __init @@ -546,8 +547,8 @@ intel_graphics_stolen(int num, int slot, int func, &base, &end); /* Mark this space as reserved */ - e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_add(base, size, E820_TYPE_RESERVED); + e820__update_table(e820_table); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 8a121991e5ba..0f0840304452 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -17,6 +17,7 @@ #include <asm/intel-mid.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> +#include <linux/usb/xhci-dbgp.h> #include <linux/efi.h> #include <asm/efi.h> #include <asm/pci_x86.h> @@ -381,6 +382,10 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "efi", 3)) early_console_register(&early_efi_console, keep); #endif +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC + if (!strncmp(buf, "xdbc", 4)) + early_xdbc_parse_parameter(buf + 4); +#endif buf++; } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..8e598a1ad986 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -50,11 +50,11 @@ #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) /* There is address space for how many espfix pages? */ -#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS -# error "Need more than one PGD for the ESPFIX hack" +# error "Need more virtual address space for the ESPFIX hack" #endif #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) @@ -121,11 +121,13 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { - pgd_t *pgd_p; + pgd_t *pgd; + p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..0651e974dcb3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,7 +24,7 @@ #include <trace/syscall.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -533,9 +533,15 @@ static void do_sync_core(void *data) static void run_sync(void) { - int enable_irqs = irqs_disabled(); + int enable_irqs; - /* We may be called with interrupts disbled (on bootup). */ + /* No need to sync if there's only one CPU */ + if (num_online_cpus() == 1) + return; + + enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disabled (on bootup). */ if (enable_irqs) local_irq_enable(); on_each_cpu(do_sync_core, NULL, 1); @@ -983,6 +989,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000000..722a145b4139 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. + */ + +#include <linux/linkage.h> +#include <asm/page_types.h> +#include <asm/segment.h> +#include <asm/export.h> +#include <asm/ftrace.h> + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) +#else +# define function_hook mcount +EXPORT_SYMBOL(mcount) +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */ +#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER) +# define USING_FRAME_POINTER +#endif + +#ifdef USING_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +ENTRY(function_hook) + ret +END(function_hook) + +ENTRY(ftrace_caller) + +#ifdef USING_FRAME_POINTER +# ifdef CC_USING_FENTRY + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ + pushl %ebp + movl %esp, %ebp + pushl 2*4(%esp) /* function ip */ +# endif + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + +#ifdef USING_FRAME_POINTER + /* Load parent ebp into edx */ + movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ + /* Get the parent ip */ + movl 4(%edx), %edx /* edx has ebp */ + + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +#ifdef USING_FRAME_POINTER + popl %ebp +# ifdef CC_USING_FENTRY + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +# endif +#endif +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * regs->ip location, and move flags into the return ip location. + */ + pushl $__KERNEL_CS + pushl 4(%esp) /* Save the return ip */ + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + + /* Get flags and place them into the return ip slot */ + pushf + popl %eax + movl %eax, 8*4(%esp) + + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ +#ifdef CC_USING_FENTRY + movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ +#else + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ +#endif + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + + /* restore flags */ + push 14*4(%esp) + popf + + /* Move return ip back to its original location */ + movl 12*4(%esp), %eax + movl %eax, 14*4(%esp) + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + + /* use lea to not affect flags */ + lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + + jmp .Lftrace_ret +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz .Ltrace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +.Ltrace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ +#ifdef CC_USING_FENTRY + lea 4*4(%esp), %edx + movl $0, %ecx +#else + lea 0x4(%ebp), %edx + movl (%ebp), %ecx +#endif + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx +#ifdef CC_USING_FENTRY + movl $0, %eax +#else + movl %ebp, %eax +#endif + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/ftrace_64.S index 7b0d3da52fb4..1dfac634bbf7 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/mcount_64.S - * * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ @@ -13,9 +11,6 @@ .code64 .section .entry.text, "ax" - -#ifdef CONFIG_FUNCTION_TRACER - #ifdef CC_USING_FENTRY # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -297,7 +292,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index e5fb436a6548..538ec012b371 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,7 +12,7 @@ #include <asm/setup.h> #include <asm/sections.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/page.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372f5dbb..43b7002f44fb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ +#define DISABLE_BRANCH_PROFILING #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> @@ -23,7 +24,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b467b14b03eb..ac9d327d2e42 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -269,10 +269,8 @@ ENTRY(secondary_startup_64) /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - jmp start_cpu -ENDPROC(secondary_startup_64) -ENTRY(start_cpu) +.Ljump_to_C_code: /* * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump @@ -305,7 +303,7 @@ ENTRY(start_cpu) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(start_cpu) +ENDPROC(secondary_startup_64) #include "verify_cpu.S" @@ -313,11 +311,11 @@ ENDPROC(start_cpu) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary() via start_cpu(). + * start_secondary() via .Ljump_to_C_code. */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp - jmp start_cpu + jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dc6ba5bda9fc..89ff7af2de50 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -354,7 +354,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); + disable_hardirq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4d8183b5f113..f34fe7444836 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -394,6 +394,9 @@ int check_irq_vectors_for_cpu_disable(void) !cpumask_subset(&affinity_new, &online_new)) this_count++; } + /* No need to check any further. */ + if (!this_count) + return 0; count = 0; for_each_online_cpu(cpu) { @@ -411,8 +414,10 @@ int check_irq_vectors_for_cpu_disable(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { + if (++count == this_count) + return 0; + } } } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..7468c6987547 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -195,7 +195,5 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); -#ifdef CONFIG_X86_32 irq_ctx_init(smp_processor_id()); -#endif } diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index bdb83e431d89..38b64587b31b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void) struct dentry *dbp, *version, *data; int error = -ENOMEM; - dbp = debugfs_create_dir("boot_params", NULL); + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); if (!dbp) return -ENOMEM; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index d0a814a9d96a..9d7fd5e6689a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/crash.h> #include <asm/efi.h> +#include <asm/e820/api.h> #include <asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ @@ -99,15 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved->nr_map; + nr_e820_entries = e820_table_firmware->nr_entries; - /* TODO: Pass entries more than E820MAX in bootparams setup data */ - if (nr_e820_entries > E820MAX) - nr_e820_entries = E820MAX; + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved->map, - nr_e820_entries * sizeof(struct e820entry)); + memcpy(¶ms->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } @@ -232,10 +232,10 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { - if (params->e820_map[i].type != E820_RAM) + if (params->e820_table[i].type != E820_TYPE_RAM) continue; - start = params->e820_map[i].addr; - end = params->e820_map[i].addr + params->e820_map[i].size - 1; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; if ((start <= 0x100000) && end > 0x100000) { mem_k = (end >> 10) - (0x100000 >> 10); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..db2182d63ed0 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); @@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src); +extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); /* Generate a relative-jump/call instruction */ extern void synthesize_reljump(void *from, void *to); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6384eb754a58..5b2bbfbb3712 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -61,6 +61,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -164,42 +165,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) NOKPROBE_SYMBOL(skip_prefixes); /* - * Returns non-zero if opcode is boostable. + * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], (unsigned long *)twobyte_is_boostable); - } + + if (insn->opcode.nbytes != 1) + return 0; + + /* Can't boost Address-size override prefix */ + if (unlikely(inat_is_address_size_prefix(insn->attr))) + return 0; + + opcode = insn->opcode.bytes[0]; switch (opcode & 0xf0) { -#ifdef CONFIG_X86_64 - case 0x40: - goto retry; /* REX prefix is boostable */ -#endif case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); + /* can't boost "bound" */ + return (opcode != 0x62); case 0x70: return 0; /* can't boost conditional jump */ + case 0x90: + return opcode != 0x9a; /* can't boost call far */ case 0xc0: /* can't boost software-interruptions */ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; @@ -210,14 +207,9 @@ retry: /* can boost in/out and absolute jmps */ return ((opcode & 0x04) || opcode == 0xea); case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ /* clear and set flags are boostable */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* segment override prefixes are boostable */ - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ /* CS override prefix and call are not boostable */ return (opcode != 0x2e && opcode != 0x9a); } @@ -264,7 +256,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (faddr) memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); else @@ -276,7 +271,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. - * Returns zero if the instruction can not get recovered. + * Returns zero if the instruction can not get recovered (or access failed). */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -348,37 +343,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn) } /* - * Copy an instruction and adjust the displacement if the instruction - * uses the %rip-relative addressing mode. - * If it does, Return the address of the 32-bit displacement word. - * If not, return null. - * Only applicable to 64-bit x86. + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) { - struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); - if (!recovered_insn) + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - length = insn.length; + + kernel_insn_init(insn, dest, MAX_INSN_SIZE); + insn_get_length(insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 - if (insn_rip_relative(&insn)) { + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, length); - insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing * mode. Adjust the displacement for the difference between @@ -391,36 +385,57 @@ int __copy_instruction(u8 *dest, u8 *src) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", + src, dest, insn->displacement.value); return 0; } - disp = (u8 *) dest + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(insn); *(s32 *) disp = (s32) newdisp; } #endif - return length; + return insn->length; +} + +/* Prepare reljump right after instruction to boost */ +static void prepare_boost(struct kprobe *p, struct insn *insn) +{ + if (can_boost(insn, p->addr) && + MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(p->ainsn.insn + insn->length, + p->addr + insn->length); + p->ainsn.boostable = true; + } else { + p->ainsn.boostable = false; + } } static int arch_copy_kprobe(struct kprobe *p) { - int ret; + struct insn insn; + int len; + + set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Copy an instruction with recovering if other optprobe modifies it.*/ - ret = __copy_instruction(p->ainsn.insn, p->addr); - if (!ret) + len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + if (!len) return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) - p->ainsn.boostable = 0; - else - p->ainsn.boostable = -1; + prepare_boost(p, &insn); + + set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Check whether the instruction modifies Interrupt Flag or not */ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); @@ -459,7 +474,7 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } } @@ -531,7 +546,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPT) - if (p->ainsn.boostable == 1 && !p->post_handler) { + if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -851,7 +866,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, case 0xcf: case 0xea: /* jmp absolute -- ip is correct */ /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; case 0xe8: /* call relative - Fix return addr */ *tos = orig_ip + (*tos - copy_ip); @@ -876,28 +891,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, * jmp near and far, absolute indirect * ip is correct. And this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; } default: break; } - if (p->ainsn.boostable == 0) { - if ((regs->ip > copy_ip) && - (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - synthesize_reljump((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - regs->ip += orig_ip - copy_ip; no_change: diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 5f8f0b3cc674..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; - p->ainsn.boostable = -1; + p->ainsn.boostable = false; return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3d1bee9d6a72..901c640d152f 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,6 +37,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -65,7 +66,10 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); @@ -174,11 +178,12 @@ NOKPROBE_SYMBOL(optimized_callback); static int copy_optimized_instructions(u8 *dest, u8 *src) { + struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + ret = __copy_instruction(dest + len, src + len, &insn); + if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } @@ -350,6 +355,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } buf = (u8 *)op->optinsn.insn; + set_memory_rw((unsigned long)buf & PAGE_MASK, 1); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); @@ -372,6 +378,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, (u8 *)op->kp.addr + op->optinsn.size); + set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + flush_icache_range((unsigned long) buf, (unsigned long) buf + TMPL_END_IDX + op->optinsn.size + RELATIVEJUMP_SIZE); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 14f65a5f938e..da5c09789984 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..8c53c5d7a1bc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,7 @@ #include <asm/io_apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) @@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( pgd_t *pgd, pmd_t *pmd, pte_t *pte, unsigned long vaddr, unsigned long paddr) { + p4d_t *p4d; pud_t *pud; pgd += pgd_index(vaddr); @@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( if (!(pgd_val(*pgd) & _PAGE_PRESENT)) set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); #endif - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); if (!(pmd_val(*pmd) & _PAGE_PRESENT)) set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..ce640428d6fe 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> +#include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -36,6 +37,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { + free_page((unsigned long)image->arch.p4d); free_page((unsigned long)image->arch.pud); free_page((unsigned long)image->arch.pmd); free_page((unsigned long)image->arch.pte); @@ -43,6 +45,7 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -53,13 +56,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } - pud = pud_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) @@ -194,19 +205,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..f67bd3205df7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f8d20497383..0d904d759ff1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -26,7 +26,7 @@ #include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/smp.h> @@ -826,10 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p) } early_param("alloc_mptable", parse_alloc_mptable_opt); -void __init early_reserve_e820_mpc_new(void) +void __init e820__memblock_alloc_reserved_mpc_new(void) { if (enable_update_mptable && alloc_mptable) - mpc_new_phys = early_reserve_e820(mpc_new_length, 4); + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f088ea4c66e7..446c8aa09b9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -166,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* - * most handlers of type NMI_UNKNOWN never return because - * they just assume the NMI is theirs. Just a sanity check - * to manage expectations + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). */ - WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); @@ -224,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4797e87b0fb6..3586996fc50d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pud = paravirt_nop, + .alloc_p4d = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, .release_pud = paravirt_nop, + .release_p4d = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, @@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, + .set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PTE_IDENT, + .make_p4d = PTE_IDENT, + .set_pgd = native_set_pgd, -#endif +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 0c150c06fa5a..fda7867046d0 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1007,9 +1007,8 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - init_timer(&tbl->watchdog_timer); - tbl->watchdog_timer.function = &calgary_watchdog; - tbl->watchdog_timer.data = (unsigned long)dev; + setup_timer(&tbl->watchdog_timer, &calgary_watchdog, + (unsigned long)dev); mod_timer(&tbl->watchdog_timer, jiffies); } diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index d5f15c3f7b25..963e3fb56437 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -14,7 +14,7 @@ #include <asm/probe_roms.h> #include <asm/pci-direct.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mmzone.h> #include <asm/setup.h> #include <asm/sections.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f67591561711..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -124,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -137,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -154,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - /* * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. */ refresh_tss_limit(); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -550,3 +616,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -56,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d6b784a5520d..b6840bf3940b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -52,6 +53,11 @@ #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/intel_rdt.h> +#include <asm/unistd.h> +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include <asm/unistd_32_ia32.h> +#endif __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -204,7 +210,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -440,7 +446,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * On Xen PV, IOPL bits in pt_regs->flags have no effect, and * current_pt_regs()->flags may not match the current task's @@ -493,6 +499,8 @@ void set_personality_64bit(void) clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -505,32 +513,50 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(bool x32) +static void __set_personality_x32(void) { - /* inherit personality from parent */ +#ifdef CONFIG_X86_X32 + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_compat_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_compat_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current->thread.status &= ~TS_COMPAT; +#endif +} + +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current->thread.status |= TS_COMPAT; +#endif +} +void set_personality_ia32(bool x32) +{ /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); - /* Mark the associated mm as containing 32-bit tasks. */ - if (x32) { - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_X32; - current->personality &= ~READ_IMPLIES_EXEC; - /* in_compat_syscall() uses the presence of the x32 - syscall bit flag to determine compat status */ - current->thread.status &= ~TS_COMPAT; - } else { - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; - current->personality |= force_personality32; - /* Prepare the first "return" to user space */ - current->thread.status |= TS_COMPAT; - } + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); @@ -547,70 +573,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -621,10 +649,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2364b23ea3e5..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -396,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e244c19a2451..2544700a2a87 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, /* Certec */ { /* Handle problems with rebooting on Certec BPC600 */ @@ -749,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2408c1603438..5ab3895516ac 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,5 @@ #include <linux/ioport.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -25,10 +25,10 @@ static void resource_clip(struct resource *res, resource_size_t start, static void remove_e820_regions(struct resource *avail) { int i; - struct e820entry *entry; + struct e820_entry *entry; - for (i = 0; i < e820->nr_map; i++) { - entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bf0c8926a1c..603a1669a2ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -70,12 +70,13 @@ #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/usb/xhci-dbgp.h> #include <video/edid.h> #include <asm/mtrr.h> #include <asm/apic.h> #include <asm/realmode.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/efi.h> @@ -119,7 +120,7 @@ * max_low_pfn_mapped: highest direct mapped pfn under 4GB * max_pfn_mapped: highest direct mapped pfn over 4GB * - * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; @@ -173,14 +174,11 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data = { - .wp_works_ok = -1, -}; +/* cpu data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .wp_works_ok = -1, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -426,7 +424,7 @@ static void __init parse_setup_data(void) switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(pa_data, data_len); + e820__memory_setup_extended(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -441,29 +439,6 @@ static void __init parse_setup_data(void) } } -static void __init e820_reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, - E820_RAM, E820_RESERVED_KERN); - pa_data = data->next; - early_memunmap(data, sizeof(*data)); - } - - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); -} - static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; @@ -756,16 +731,16 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. * take them out. */ - e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); } /* called before trim_bios_range() to spare extra sanitize */ @@ -775,18 +750,18 @@ static void __init e820_add_kernel_range(void) u64 size = __pa_symbol(_end) - start; /* - * Complain if .text .data and .bss are not marked as E820_RAM and + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and * attempt to fix it by adding the range. We may have a confused BIOS, * or the user may have used memmap=exactmap or memmap=xxM$yyM to * exclude kernel range. If we really are running on top non-RAM, * we will crash later anyways. */ - if (e820_all_mapped(start, start + size, E820_RAM)) + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) return; - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - e820_remove_range(start, size, E820_RAM, 0); - e820_add_region(start, size, E820_RAM); + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -837,6 +812,26 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +static void __init simple_udelay_calibration(void) +{ + unsigned int tsc_khz, cpu_khz; + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? : cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -939,7 +934,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; - setup_memory_map(); + e820__memory_setup(); parse_setup_data(); copy_edd(); @@ -985,6 +980,8 @@ void __init setup_arch(char **cmdline_p) */ x86_configure_nx(); + simple_udelay_calibration(); + parse_early_param(); #ifdef CONFIG_MEMORY_HOTPLUG @@ -1028,9 +1025,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif - /* update the e820_saved too */ - e820_reserve_setup_data(); - finish_e820_parsing(); + e820__reserve_setup_data(); + e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); @@ -1056,11 +1052,11 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); + e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); @@ -1070,12 +1066,12 @@ void __init setup_arch(char **cmdline_p) * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; @@ -1094,7 +1090,7 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) - max_low_pfn = e820_end_of_low_ram_pfn(); + max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; @@ -1111,7 +1107,7 @@ void __init setup_arch(char **cmdline_p) early_alloc_pgt_buf(); /* - * Need to conclude brk, before memblock_x86_fill() + * Need to conclude brk, before e820__memblock_setup() * it could use memblock_find_in_range, could overlap with * brk area. */ @@ -1120,7 +1116,10 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); memblock_set_current_limit(ISA_END_ADDRESS); - memblock_x86_fill(); + e820__memblock_setup(); + + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); reserve_bios_regions(); @@ -1137,7 +1136,7 @@ void __init setup_arch(char **cmdline_p) } /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); + e820__memblock_alloc_reserved_mpc_new(); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); @@ -1226,21 +1225,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. - */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif - tboot_probe(); map_vsyscall(); @@ -1275,12 +1259,12 @@ void __init setup_arch(char **cmdline_p) kvm_guest_init(); - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); + e820__reserve_resources(); + e820__register_nosave_regions(max_low_pfn); x86_init.resources.reserve_resources(); - e820_setup_gap(); + e820__setup_pci_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..bb1e8cc0bc84 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); gdt.s = 1; - write_gdt_entry(get_cpu_gdt_table(cpu), + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); #endif } @@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + +#ifdef CONFIG_X86_32 + /* + * Sync back kernel address range. We want to make sure that + * all kernel mappings, including percpu mappings, are available + * in the smpboot asm. We can't reliably pick up percpu + * mappings using vmalloc_fault(), because exception dispatch + * needs percpu data. + */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 396c042e9d0e..cc30a74e4adb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -846,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d3c66a15bbde..d798c0da451c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -33,6 +33,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> +#include <asm/virtext.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); @@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; + cpu_emergency_vmxoff(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); + cpu_emergency_vmxoff(); stop_this_cpu(NULL); irq_exit(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bd1f1ad35284..f04479a8f74f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8e2b79b88e51..8dabd7bf1673 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -76,6 +76,101 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE + +#define STACKTRACE_DUMP_ONCE(task) ({ \ + static bool __section(.data.unlikely) __dumped; \ + \ + if (!__dumped) { \ + __dumped = true; \ + WARN_ON(1); \ + show_stack(task, NULL); \ + } \ +}) + +static int __save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state); + if (regs) { + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (!user_mode(regs)) + return -EINVAL; + + /* + * The last frame contains the user mode syscall + * pt_regs. Skip it and finish the unwind. + */ + unwind_next_frame(&state); + if (!unwind_done(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + break; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (save_stack_address(trace, addr, false)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + + return 0; +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + if (!try_get_task_stack(tsk)) + return -EINVAL; + + ret = __save_stack_trace_reliable(trace, tsk); + + put_task_stack(tsk); + + return ret; +} +#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ + /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -138,4 +233,3 @@ void save_stack_trace_user(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } - diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..207b8f2582c7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,6 +17,8 @@ #include <linux/uaccess.h> #include <linux/elf.h> +#include <asm/elf.h> +#include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> @@ -101,7 +103,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { + if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + if (!in_compat_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..4b1724059909 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -42,7 +42,7 @@ #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include "../realmode/rm/wakeup.h" @@ -68,9 +68,9 @@ void __init tboot_probe(void) * also verify that it is mapped as we expect it before calling * set_fixmap(), to reduce chance of garbage value causing crash */ - if (!e820_any_mapped(boot_params.tboot_addr, - boot_params.tboot_addr, E820_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + if (!e820__mapped_any(boot_params.tboot_addr, + boot_params.tboot_addr, E820_TYPE_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); @@ -188,12 +192,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820->nr_map; i++) { - if ((e820->map[i].type != E820_RAM) - && (e820->map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820_table->nr_entries; i++) { + if ((e820_table->entries[i].type != E820_TYPE_RAM) + && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) continue; - add_mac_region(e820->map[i].addr, e820->map[i].size); + add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = @@ -510,6 +514,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) + if (LDT_empty(info) || LDT_zero(info)) { desc->a = desc->b = 0; - else + } else { fill_ldt(desc, info); + + /* + * Always set the accessed bit so that the CPU + * doesn't try to write to the (read-only) GDT. + */ + desc->type |= 1; + } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 948443e115c1..3995d3a777d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -169,6 +169,37 @@ void ist_end_non_atomic(void) preempt_disable(); } +int is_valid_bugaddr(unsigned long addr) +{ + unsigned short ud; + + if (addr < TASK_SIZE_MAX) + return 0; + + if (probe_kernel_address((unsigned short *)addr, ud)) + return 0; + + return ud == INSN_UD0 || ud == INSN_UD2; +} + +static int fixup_bug(struct pt_regs *regs, int trapnr) +{ + if (trapnr != X86_TRAP_UD) + return 0; + + switch (report_bug(regs->ip, regs)) { + case BUG_TRAP_TYPE_NONE: + case BUG_TRAP_TYPE_BUG: + break; + + case BUG_TRAP_TYPE_WARN: + regs->ip += LEN_UD0; + return 1; + } + + return 0; +} + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) @@ -187,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return 0; + if (fixup_exception(regs, trapnr)) + return 0; + + if (fixup_bug(regs, trapnr)) + return 0; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); } return -1; @@ -255,7 +289,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -519,7 +553,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46bcda4cb1c2..714dfba6a1e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -327,9 +327,16 @@ unsigned long long sched_clock(void) { return paravirt_sched_clock(); } + +bool using_native_sched_clock(void) +{ + return pv_time_ops.sched_clock == native_sched_clock; +} #else unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); + +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) @@ -1112,8 +1119,10 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; + tsc_unstable = 1; - clear_sched_clock_stable(); + if (using_native_sched_clock()) + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } @@ -1135,18 +1144,20 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - if (!tsc_unstable) { - tsc_unstable = 1; + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); - pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_mark_unstable(&clocksource_tsc); - else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) { + clocksource_mark_unstable(&clocksource_tsc); + } else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; + clocksource_tsc.rating = 0; } } @@ -1322,6 +1333,8 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 478d15dbaee4..82c6d7f1fd73 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,6 +1,8 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> +#include <linux/interrupt.h> +#include <asm/sections.h> #include <asm/ptrace.h> #include <asm/bitops.h> #include <asm/stacktrace.h> @@ -23,53 +25,53 @@ val; \ }) -static void unwind_dump(struct unwind_state *state, unsigned long *sp) +static void unwind_dump(struct unwind_state *state) { static bool dumped_before = false; bool prev_zero, zero = false; - unsigned long word; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; if (dumped_before) return; dumped_before = true; - printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { - word = READ_ONCE_NOCHECK(*sp); + for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; - prev_zero = zero; - zero = word == 0; + for (; sp < stack_info.end; sp++) { - if (zero) { - if (!prev_zero) - printk_deferred("%p: %016x ...\n", sp, 0); - continue; - } + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; + + if (zero) { + if (!prev_zero) + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); + continue; + } - printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); + } } } unsigned long unwind_get_return_address(struct unwind_state *state) { - unsigned long addr; - unsigned long *addr_p = unwind_get_return_address_ptr(state); - if (unwind_done(state)) return 0; - if (state->regs && user_mode(state->regs)) - return 0; - - addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, - addr_p); - - return __kernel_text_address(addr) ? addr : 0; + return __kernel_text_address(state->ip) ? state->ip : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -82,19 +84,68 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } +static bool in_entry_code(unsigned long ip) +{ + char *addr = (char *)ip; + + if (addr >= __entry_text_start && addr < __entry_text_end) + return true; + +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + if (addr >= __irqentry_text_start && addr < __irqentry_text_end) + return true; +#endif + + return false; +} + +static inline unsigned long *last_frame(struct unwind_state *state) +{ + return (unsigned long *)task_pt_regs(state->task) - 2; +} + +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + +static inline unsigned long *last_aligned_frame(struct unwind_state *state) +{ + return last_frame(state) - GCC_REALIGN_WORDS; +} + static bool is_last_task_frame(struct unwind_state *state) { - unsigned long bp = (unsigned long)state->bp; - unsigned long regs = (unsigned long)task_pt_regs(state->task); + unsigned long *last_bp = last_frame(state); + unsigned long *aligned_bp = last_aligned_frame(state); /* * We have to check for the last task frame at two different locations * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame by a word in the prologue of a - * function called by head/entry code. + * change the offset of the stack frame in the prologue of a function + * called by head/entry code. Examples: + * + * <start_secondary>: + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * <x86_64_start_kernel>: + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * Note that after aligning the stack, it pushes a duplicate copy of + * the return address before pushing the frame pointer. */ - return bp == regs - FRAME_HEADER_SIZE || - bp == regs - FRAME_HEADER_SIZE - sizeof(long); + return (state->bp == last_bp || + (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); } /* @@ -111,26 +162,70 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } -static bool update_stack_state(struct unwind_state *state, void *addr, - size_t len) +static bool update_stack_state(struct unwind_state *state, + unsigned long *next_bp) { struct stack_info *info = &state->stack_info; - enum stack_type orig_type = info->type; + enum stack_type prev_type = info->type; + struct pt_regs *regs; + unsigned long *frame, *prev_frame_end, *addr_p, addr; + size_t len; + + if (state->regs) + prev_frame_end = (void *)state->regs + regs_size(state->regs); + else + prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; + + /* Is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + frame = (unsigned long *)regs; + len = regs_size(regs); + state->got_irq = true; + } else { + frame = next_bp; + len = FRAME_HEADER_SIZE; + } /* - * If addr isn't on the current stack, switch to the next one. + * If the next bp isn't on the current stack, switch to the next one. * * We may have to traverse multiple stacks to deal with the possibility - * that 'info->next_sp' could point to an empty stack and 'addr' could - * be on a subsequent stack. + * that info->next_sp could point to an empty stack and the next bp + * could be on a subsequent stack. */ - while (!on_stack(info, addr, len)) + while (!on_stack(info, frame, len)) if (get_stack_info(info->next_sp, state->task, info, &state->stack_mask)) return false; - if (!state->orig_sp || info->type != orig_type) - state->orig_sp = addr; + /* Make sure it only unwinds up and doesn't overlap the prev frame: */ + if (state->orig_sp && state->stack_info.type == prev_type && + frame < prev_frame_end) + return false; + + /* Move state to the next frame: */ + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + + /* Save the return address: */ + if (state->regs && user_mode(state->regs)) + state->ip = 0; + else { + addr_p = unwind_get_return_address_ptr(state); + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + } + + /* Save the original stack pointer for unwind_dump(): */ + if (!state->orig_sp) + state->orig_sp = frame; return true; } @@ -138,14 +233,12 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; - unsigned long *next_bp, *next_frame; - size_t next_len; - enum stack_type prev_type = state->stack_info.type; + unsigned long *next_bp; if (unwind_done(state)) return false; - /* have we reached the end? */ + /* Have we reached the end? */ if (state->regs && user_mode(state->regs)) goto the_end; @@ -173,58 +266,25 @@ bool unwind_next_frame(struct unwind_state *state) */ state->regs = regs; state->bp = NULL; + state->ip = 0; return true; } - /* get the next frame pointer */ + /* Get the next frame pointer: */ if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); - - /* is the next frame pointer an encoded pointer to pt_regs? */ - regs = decode_frame_pointer(next_bp); - if (regs) { - next_frame = (unsigned long *)regs; - next_len = sizeof(*regs); - } else { - next_frame = next_bp; - next_len = FRAME_HEADER_SIZE; - } - - /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_frame, next_len)) { - /* - * Don't warn on bad regs->bp. An interrupt in entry code - * might cause a false positive warning. - */ - if (state->regs) - goto the_end; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + /* Move to the next frame if it's safe: */ + if (!update_stack_state(state, next_bp)) goto bad_address; - } - - /* Make sure it only unwinds up and doesn't overlap the last frame: */ - if (state->stack_info.type == prev_type) { - if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) - goto bad_address; - - if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) - goto bad_address; - } - - /* move to the next frame */ - if (regs) { - state->regs = regs; - state->bp = NULL; - } else { - state->bp = next_bp; - state->regs = NULL; - } return true; bad_address: + state->error = true; + /* * When unwinding a non-current task, the task might actually be * running on another CPU, in which case it could be modifying its @@ -235,18 +295,29 @@ bad_address: if (state->task != current) goto the_end; + /* + * Don't warn if the unwinder got lost due to an interrupt in entry + * code or in the C handler before the first frame pointer got set up: + */ + if (state->got_irq && in_entry_code(state->ip)) + goto the_end; + if (state->regs && + state->regs->sp >= (unsigned long)last_aligned_frame(state) && + state->regs->sp < (unsigned long)task_pt_regs(state->task)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, (unsigned long *)state->regs); + state->task->pid, next_bp); + unwind_dump(state); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, state->bp); + state->task->pid, next_bp); + unwind_dump(state); } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; @@ -257,35 +328,24 @@ EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { - unsigned long *bp, *frame; - size_t len; + unsigned long *bp; memset(state, 0, sizeof(*state)); state->task = task; + state->got_irq = (regs); - /* don't even attempt to start from user mode regs */ + /* Don't even attempt to start from user mode regs: */ if (regs && user_mode(regs)) { state->stack_info.type = STACK_TYPE_UNKNOWN; return; } - /* set up the starting stack frame */ bp = get_frame_pointer(task, regs); - regs = decode_frame_pointer(bp); - if (regs) { - state->regs = regs; - frame = (unsigned long *)regs; - len = sizeof(*regs); - } else { - state->bp = bp; - frame = bp; - len = FRAME_HEADER_SIZE; - } - /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(frame, state->task, &state->stack_info, + /* Initialize stack info and make sure the frame data is accessible: */ + get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, frame, len); + update_stack_state(state, bp); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 22881ddcbb9f..039f36738e49 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -34,7 +34,7 @@ bool unwind_next_frame(struct unwind_state *state) return true; } - state->sp = info->next_sp; + state->sp = PTR_ALIGN(info->next_sp, sizeof(long)); } while (!get_stack_info(state->sp, state->task, info, &state->stack_mask)); @@ -49,7 +49,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; - state->sp = first_frame; + state->sp = PTR_ALIGN(first_frame, sizeof(long)); get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..7924a5356c8a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); @@ -193,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb(); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c74ae9ce8dc4..c8a3b61be0aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -146,6 +146,7 @@ SECTIONS _edata = .; } :data + BUG_TABLE . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 11a93f005268..a088b2c47f73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,7 +14,7 @@ #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> @@ -38,7 +38,7 @@ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, - .memory_setup = default_machine_specific_memory_setup, + .memory_setup = e820__memory_setup_default, }, .mpparse = { |