From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. In the resume case cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel/tsc.c') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357e..968425422c46 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -744,10 +744,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | -- cgit v1.2.3 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/tsc.c') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c46..fc3672a303d6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0e..83d2fbd81b93 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9fb..a0af4ffcb6e5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3 From 845b3944bbdf9e9247849bf037f27ff3a3f26d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 15:37:03 +0200 Subject: x86: Add timer_init to x86_init_ops The timer init code is convoluted with several quirks and the paravirt timer chooser. Figuring out which code path is actually taken is not for the faint hearted. Move the numaq TSC quirk to tsc_pre_init x86_init_ops function and replace the paravirt time chooser and the remaining x86 quirk with a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 5 ---- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 21 ++--------------- arch/x86/include/asm/time.h | 1 - arch/x86/include/asm/timer.h | 3 +-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 10 ++------ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 43 ----------------------------------- arch/x86/kernel/time_32.c | 34 ++++++++++++++++++--------- arch/x86/kernel/time_64.c | 9 ++++++-- arch/x86/kernel/tsc.c | 2 ++ arch/x86/kernel/visws_quirks.c | 20 ++++------------ arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/x86_init.c | 3 +++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 4 ++-- 17 files changed, 53 insertions(+), 113 deletions(-) (limited to 'arch/x86/kernel/tsc.c') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 825674a968d1..11a4ba7b209c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -34,11 +34,6 @@ static inline int set_wallclock(unsigned long nowtime) return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); } -static inline void (*choose_time_init(void))(void) -{ - return pv_time_ops.time_init; -} - /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 1da89276d142..0d812e592e3b 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,8 +88,6 @@ struct pv_lazy_ops { }; struct pv_time_ops { - void (*time_init)(void); - /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 58b58952b80d..861e1fe2303b 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -5,24 +5,6 @@ #define COMMAND_LINE_SIZE 2048 -#ifndef __ASSEMBLY__ - -#include - -/* - * Any setup quirks to be performed? - */ - -struct x86_quirks { - int (*arch_pre_time_init)(void); - int (*arch_time_init)(void); -}; - -extern void x86_quirk_pre_time_init(void); -extern void x86_quirk_time_init(void); - -#endif /* __ASSEMBLY__ */ - #ifdef __i386__ #include @@ -42,6 +24,7 @@ extern void x86_quirk_time_init(void); #ifndef __ASSEMBLY__ #include +#include /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 @@ -60,11 +43,11 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif -extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); +extern void setup_default_timer_irq(void); #ifndef _SETUP diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac421..91bb162b5a31 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -54,7 +54,6 @@ extern void time_init(void); #define get_wallclock() native_get_wallclock() #define set_wallclock(x) native_set_wallclock(x) -#define choose_time_init() hpet_time_init #endif /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d4686..e854c7ab4169 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,8 +12,7 @@ unsigned long native_calibrate_tsc(void); #ifdef CONFIG_X86_32 extern int timer_ack; -extern irqreturn_t timer_interrupt(int irq, void *dev_id); -#endif /* CONFIG_X86_32 */ +#endif extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b7d258f4c401..f8bdd2271a04 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,9 +82,13 @@ struct x86_init_paging { * struct x86_init_timers - platform specific timer setup * @setup_perpcu_clockev: set up the per cpu clock event device for the * boot cpu + * @tsc_pre_init: platform function called before TSC init + * @timer_init: initialize the platform timer (default PIT/HPET) */ struct x86_init_timers { void (*setup_percpu_clockev)(void); + void (*tsc_pre_init)(void); + void (*timer_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 71c5ea645865..f1ebed6bd150 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -129,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -262,11 +261,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) } } -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, -}; - static __init void early_check_numaq(void) { /* @@ -281,13 +275,13 @@ static __init void early_check_numaq(void) early_get_smp_config(); if (found_numaq) { - x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; } } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ed32c79679d..9c0e644a76dc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -306,7 +306,6 @@ struct pv_init_ops pv_init_ops = { }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 43ec6aa175bd..bb207a47c631 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -626,10 +626,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -1016,45 +1012,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) -{ - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); -} - -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. - * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5c5d87f0b2e1..89bbb52218b8 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(profile_pc); * Time Stamp Counter value at the time of the timer interrupt, so that * we later on can estimate the time of day more exactly. */ -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); @@ -113,25 +113,37 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Duplicate of time_init() below, with hpet_enable part added */ +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - x86_quirk_time_init(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); } /* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. */ void __init time_init(void) { - x86_quirk_pre_time_init(); tsc_init(); - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 5ba343e61844..38a7df94c107 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -127,9 +128,13 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + void __init time_init(void) { tsc_init(); - - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357e..652bc214eebf 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -857,6 +857,8 @@ void __init tsc_init(void) u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 2719091b3351..f068553a1b17 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,11 +67,7 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } /* Replaces the default init_ISA_irqs in the generic setup */ @@ -226,10 +223,6 @@ static void __init visws_find_smp_config(unsigned int reserve) static void visws_trap_init(void); -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, -}; - void __init visws_early_detect(void) { int raw; @@ -241,17 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; - x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b43b6685cae1..cd7d0fbbf66e 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -817,7 +817,7 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; + x86_init.timers.timer_init = vmi_time_init; pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e666a98db7cd..4790b92714a6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -58,6 +59,8 @@ struct __initdata x86_init_ops x86_init = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1ff986511f1d..6caa8c0c793b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; + x86_init.timers.timer_init = lguest_time_init; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 14e597e0c160..84826b842b54 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -842,8 +842,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_tsc_khz = xen_tsc_khz, @@ -977,6 +975,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + + x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; -- cgit v1.2.3 From 08047c4f1740c7cee75d58e2919d48c09f951649 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:27:41 +0200 Subject: x86: Move calibrate_cpu to tsc.c Move the code where it's only user is. Also we need to look whether this hardwired hackery might interfere with perfcounters. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/time.h | 2 -- arch/x86/kernel/time_32.c | 1 - arch/x86/kernel/time_64.c | 51 ---------------------------------------- arch/x86/kernel/tsc.c | 57 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 56 deletions(-) (limited to 'arch/x86/kernel/tsc.c') diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 91bb162b5a31..9c5608b21c27 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -57,6 +57,4 @@ extern void time_init(void); #endif /* CONFIG_PARAVIRT */ -extern unsigned long __init calibrate_cpu(void); - #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 186abc577b2b..fd876cc77487 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 78cbdf5c006b..e59a40ebff14 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; @@ -84,56 +83,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 652bc214eebf..97a0bcbad100 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -852,6 +853,60 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; @@ -870,11 +925,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, -- cgit v1.2.3 From 2d826404f0bdcac2a4dd7e3c446b70d6a3b63b78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 17:06:25 +0200 Subject: x86: Move tsc_calibration to x86_init_ops TSC calibration is modified by the vmware hypervisor and paravirt by separate means. Moorestown wants to add its own calibration routine as well. So make calibrate_tsc a proper x86_init_ops function and override it by paravirt or by the early setup of the vmware hypervisor. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hypervisor.h | 2 +- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/timer.h | 5 ----- arch/x86/include/asm/tsc.h | 3 ++- arch/x86/include/asm/vmware.h | 2 +- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/cpu/hypervisor.c | 14 +++++++------- arch/x86/kernel/cpu/vmware.c | 21 ++++++++++++--------- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 13 ++++--------- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 3 ++- 17 files changed, 48 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel/tsc.c') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a1..b78c0941e422 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__HYPERVISOR_H #define ASM_X86__HYPERVISOR_H -extern unsigned long get_hypervisor_tsc_freq(void); extern void init_hypervisor(struct cpuinfo_x86 *c); +extern void init_hypervisor_platform(void); #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 11a4ba7b209c..1e458a553303 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -210,7 +210,6 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 65228ccc5f0d..5469630b27f5 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,7 +8,6 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calibrate_tsc(void); extern int recalibrate_cpu_khz(void); #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) @@ -19,10 +18,6 @@ extern int timer_ack; extern int no_timer_check; -#ifndef CONFIG_PARAVIRT -#define calibrate_tsc() native_calibrate_tsc() -#endif - /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91b..c0427295e8f5 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -int check_tsc_unstable(void); +extern int check_tsc_unstable(void); +extern unsigned long native_calibrate_tsc(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d83..e49ed6d2fd4e 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__VMWARE_H #define ASM_X86__VMWARE_H -extern unsigned long vmware_get_tsc_khz(void); +extern void vmware_platform_setup(void); extern int vmware_platform(void); extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f8bdd2271a04..20df51871713 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -112,8 +112,17 @@ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); }; +/** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_tsc: calibrate TSC + */ +struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; +extern struct x86_platform_ops x86_platform; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100a..08be922de33a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec93..0a46b4df5d80 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,29 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always have a VMware @@ -87,12 +96,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 64e9b5f59d2d..75a21b61b863 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,7 +187,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9c0e644a76dc..7cbf898d839b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -309,7 +309,6 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bb207a47c631..2d93026af7cd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -818,7 +818,7 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. */ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); x86_init.resources.probe_roms(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 97a0bcbad100..9917632a8b49 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -401,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -567,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -917,7 +912,7 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index cd7d0fbbf66e..052ae81ee08b 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -825,7 +825,7 @@ static inline int __init activate_vmi(void) x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efeeb..611b9e2360d3 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4790b92714a6..13081b921914 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,3 +68,7 @@ struct __initdata x86_init_ops x86_init = { __cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, +}; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 6caa8c0c793b..fabe745513d9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; x86_init.timers.timer_init = lguest_time_init; + x86_platform.calibrate_tsc = lguest_tsc_khz; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84826b842b54..ee8cac77c8a4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -844,7 +844,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; @@ -980,6 +979,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; + x86_platform.calibrate_tsc = xen_tsc_khz; + #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit -- cgit v1.2.3