Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/alternative.c  |   7
-rw-r--r--  arch/x86/kernel/cpu/amd.c      |  13
-rw-r--r--  arch/x86/kernel/cpu/common.c   |  40
-rw-r--r--  arch/x86/kernel/jump_label.c   |  11
-rw-r--r--  arch/x86/kernel/kvm.c          |  14
-rw-r--r--  arch/x86/kernel/kvmclock.c     | 258
-rw-r--r--  arch/x86/kernel/setup.c        |  10
-rw-r--r--  arch/x86/kernel/tsc.c          | 259
-rw-r--r--  arch/x86/kernel/tsc_msr.c      |  96
-rw-r--r--  arch/x86/kernel/x86_init.c     |   2
10 files changed, 350 insertions(+), 360 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is runnig we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 38915fbfae73..b732438c1a1e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -617,6 +615,14 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -863,9 +869,6 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index df28e931d732..ba6b8bb1c036 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1019,6 +1019,24 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } /* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1092,6 +1110,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); } void __init early_cpu_init(void) @@ -1127,24 +1147,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. 
- * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1207,8 +1209,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if (type == JUMP_LABEL_JMP) { if (init) { /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a37bda38d205..09aaabb2bbf1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> -#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; @@ -628,6 +615,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3b8e7c13c614..d2edd7e6c294 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,30 +23,56 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/memblock.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/mm.h> +#include <asm/hypervisor.h> #include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; 
+static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; } early_param("no-kvmclock", parse_no_kvmclock); -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. Since then, some time may @@ -55,21 +81,10 @@ static struct pvclock_wall_clock *wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); - - put_cpu(); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -79,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -112,11 +123,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -130,19 +141,11 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return tsc_khz; + return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) 
+static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; @@ -156,49 +159,40 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); + if (!src) + return; - return ret; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -213,11 +207,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -245,100 +235,84 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) +static int __init kvm_setup_vsyscall_timeinfo(void) { - phys_addr_t mem; +#ifdef CONFIG_X86_64 + u8 flags; - mem = memblock_alloc(size, align); - if (!mem) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - return mem; -e_free: - memblock_free(mem, size); + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif return 0; } +early_initcall(kvm_setup_vsyscall_timeinfo); -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +static int kvmclock_setup_percpu(unsigned int cpu) { - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); - memblock_free(addr, size); + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. 
+ */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; } void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; } - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { return; } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - pvclock_set_pvti_cpu0_va(hv_clock); + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -350,31 +324,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - put_cpu(); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kernel/setup.c 
b/arch/x86/kernel/setup.c index 2f86d883dd95..5d32c55aeb8b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -866,6 +866,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1012,6 +1014,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1197,11 +1200,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1272,8 +1270,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -106,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. 
(cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) @@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes @@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP @@ -1307,7 +1340,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1341,40 +1374,22 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? 
: cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) -{ - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1383,52 +1398,78 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + loops_per_jiffy = get_loops_per_jiffy(); + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. 
*/ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; - if (tsc_disabled > 0) + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } - /* now allow native_sched_clock() to use rdtsc */ + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); @@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,17 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> - * - * This file is released under the GPLv2. */ #include <linux/kernel.h> -#include <asm/processor.h> -#include <asm/setup.h> + #include <asm/apic.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/msr.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 @@ -23,44 +25,48 @@ * field msr_plat does. */ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. 
+ */ +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. @@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, |
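The alternative.c and jump_label.c hunks are tied together: setup_arch() now runs arch_init_ideal_nops() and jump_label_init() before mm_init(), so text_poke() cannot be used yet because struct page is not initialized, which the new BUG_ON(!after_bootmem) makes explicit. The jump-label code therefore falls back to the early poker while early-boot interrupts are still disabled. The two guards, extracted from the diff:

	/*
	 * arch/x86/kernel/alternative.c, text_poke(): struct page is not
	 * usable while the boot memory allocator is still running.
	 */
	BUG_ON(!after_bootmem);

	/*
	 * arch/x86/kernel/jump_label.c, __jump_label_transform(): poke with
	 * text_poke_early() while early-boot IRQs are still disabled.
	 */
	if (early_boot_irqs_disabled)
		poker = text_poke_early;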
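In kvmclock.c, the memblock-allocated hv_clock array is gone: the first CPUs use a page-aligned static boot array and every CPU carries a per-CPU pointer to its own pvclock_vsyscall_time_info. Each CPU registers its clock by writing the structure's physical address, with bit 0 set as the enable flag, into the kvmclock system-time MSR. Condensed from the diff:

	#define HVC_BOOT_ARRAY_SIZE \
		(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))

	static struct pvclock_vsyscall_time_info
		hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
	static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

	static void kvm_register_clock(char *txt)
	{
		struct pvclock_vsyscall_time_info *src = this_cpu_read(hv_clock_per_cpu);
		u64 pa;

		if (!src)
			return;

		/* Bit 0 of the MSR payload tells the hypervisor the area is active. */
		pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
		wrmsrl(msr_kvm_system_time, pa);
		pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
	}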
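CPUs that do not fit in the static boot page get their time-info structure allocated from a CPU-hotplug prepare callback, registered in kvmclock_init() with cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", kvmclock_setup_percpu, NULL); the callback runs on the control CPU before the incoming CPU is started. Its logic, as in the diff:

	static int kvmclock_setup_percpu(unsigned int cpu)
	{
		struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);

		/*
		 * Per-CPU area setup replicates CPU0's pointer to every CPU, so
		 * only act on CPUs that still alias CPU0; CPU0 itself has
		 * already been set up in kvmclock_init().
		 */
		if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
			return 0;

		/* Static page for the first CPUs, kzalloc() for the rest. */
		if (cpu < HVC_BOOT_ARRAY_SIZE)
			p = &hv_clock_boot[cpu];
		else
			p = kzalloc(sizeof(*p), GFP_KERNEL);

		per_cpu(hv_clock_per_cpu, cpu) = p;
		return p ? 0 : -ENOMEM;
	}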
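The tsc.c rework splits cyc2ns setup so the boot CPU can be initialized from tsc_early_init() and the secondary CPUs get their copies later from tsc_init(). The conversion itself is unchanged: each CPU keeps a (mul, shift, offset) triple, read under a latch sequence counter, and nanoseconds are computed as offset + (cycles * mul) >> shift. As an illustrative example (numbers are not from the patch): a 2,500,000 kHz TSC means 0.4 ns per cycle, which a pair such as mul = 3355443, shift = 23 approximates as 3355443 / 2^23 ≈ 0.4 ns per cycle. A minimal sketch of the read side, assuming the cyc2ns_data field names shown in the diff:

	/*
	 * Sketch only: cycles_2_ns() does the equivalent of this after
	 * fetching a consistent cyc2ns_data snapshot under the latch.
	 */
	static inline u64 cycles_to_ns(u64 cyc, const struct cyc2ns_data *d)
	{
		return d->cyc2ns_offset +
		       mul_u64_u32_shr(cyc, d->cyc2ns_mul, d->cyc2ns_shift);
	}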
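CPU frequency calibration is now split into an early and a late path. x86_init.c installs native_calibrate_cpu_early as the default x86_platform.calibrate_cpu; it tries only the methods available before ACPI is up (CPUID, the MSR tables, a quick PIT read). tsc_init() later swaps in native_calibrate_cpu, which falls back to the slower PIT/HPET/ACPI-PM-timer loop if the early methods found nothing. Condensed from the diff:

	unsigned long native_calibrate_cpu_early(void)
	{
		unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();

		if (!fast_calibrate)
			fast_calibrate = cpu_khz_from_msr();
		if (!fast_calibrate) {
			local_irq_save(flags);
			fast_calibrate = quick_pit_calibrate();
			local_irq_restore(flags);
		}
		return fast_calibrate;
	}

	static unsigned long native_calibrate_cpu(void)
	{
		unsigned long tsc_freq = native_calibrate_cpu_early();

		if (!tsc_freq)
			tsc_freq = pit_hpet_ptimer_calibrate_cpu();

		return tsc_freq;
	}

	/* In tsc_init(): switch over to the full calibration routine. */
	if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
		x86_platform.calibrate_cpu = native_calibrate_cpu;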
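tsc_msr.c drops its hand-rolled family/model table in favour of the generic x86_cpu_id matching machinery: each supported Atom model points at a struct freq_desc through driver_data, and cpu_khz_from_msr() looks it up with x86_match_cpu(), which also makes the explicit Intel vendor check unnecessary. A trimmed sketch of the lookup path following the diff (the !msr_plat branch is not shown in the hunk above and the trailing setup is omitted):

	static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
		INTEL_CPU_FAM6(ATOM_PENWELL,		freq_desc_pnw),
		INTEL_CPU_FAM6(ATOM_SILVERMONT1,	freq_desc_byt),
		INTEL_CPU_FAM6(ATOM_AIRMONT,		freq_desc_cht),
		/* ... remaining models as in the diff ... */
		{}
	};

	unsigned long cpu_khz_from_msr(void)
	{
		const struct x86_cpu_id *id = x86_match_cpu(tsc_msr_cpu_ids);
		const struct freq_desc *freq_desc;
		u32 lo, hi, ratio, freq;

		if (!id)
			return 0;

		freq_desc = (struct freq_desc *)id->driver_data;
		if (freq_desc->msr_plat) {
			rdmsr(MSR_PLATFORM_INFO, lo, hi);
			ratio = (lo >> 8) & 0xff;
		} else {
			/* Older models take the bus ratio from MSR_IA32_PERF_STATUS. */
			rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
			ratio = (hi >> 8) & 0x1f;
		}

		/* Map the FSB reference-clock ID (0-7) to a frequency in kHz. */
		rdmsr(MSR_FSB_FREQ, lo, hi);
		freq = freq_desc->freqs[lo & 0x7];

		/* TSC frequency = maximum resolved freq * maximum resolved bus ratio. */
		return freq * ratio;
	}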