diff options
Diffstat (limited to 'arch/x86_64/kernel/time.c')
-rw-r--r-- | arch/x86_64/kernel/time.c | 236 |
1 files changed, 140 insertions, 96 deletions
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7392570f975d..7ea3bf2a858c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,7 +8,7 @@ * Copyright (c) 1995 Markus Kuhn * Copyright (c) 1996 Ingo Molnar * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ @@ -24,10 +24,13 @@ #include <linux/device.h> #include <linux/sysdev.h> #include <linux/bcd.h> +#include <linux/notifier.h> +#include <linux/cpu.h> #include <linux/kallsyms.h> #include <linux/acpi.h> #ifdef CONFIG_ACPI #include <acpi/achware.h> /* for PM timer frequency */ +#include <acpi/acpi_bus.h> #endif #include <asm/8253pit.h> #include <asm/pgtable.h> @@ -38,9 +41,7 @@ #include <asm/sections.h> #include <linux/cpufreq.h> #include <linux/hpet.h> -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/apic.h> -#endif #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); @@ -48,17 +49,24 @@ static void cpufreq_delayed_get(void); extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); +static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ @@ -90,7 +98,7 @@ static inline unsigned int do_gettimeoffset_tsc(void) t = get_cycles_sync(); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; return x; } @@ -98,7 +106,7 @@ static inline unsigned int do_gettimeoffset_hpet(void) { /* cap counter read to one tick to avoid inconsistencies */ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; + return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -118,7 +126,7 @@ void do_gettimeofday(struct timeval *tv) seq = read_seqbegin(&xtime_lock); sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; + usec = xtime.tv_nsec / NSEC_PER_USEC; /* i386 does some correction here to keep the clock monotonous even when ntpd is fixing drift. @@ -129,14 +137,14 @@ void do_gettimeofday(struct timeval *tv) in arch/x86_64/kernel/vsyscall.c and export all needed variables in vmlinux.lds. -AK */ - t = (jiffies - wall_jiffies) * (1000000L / HZ) + + t = (jiffies - wall_jiffies) * USEC_PER_TICK + do_gettimeoffset(); usec += t; } while (read_seqretry(&xtime_lock, seq)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + tv->tv_sec = sec + usec / USEC_PER_SEC; + tv->tv_usec = usec % USEC_PER_SEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -157,8 +165,8 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irq(&xtime_lock); - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + nsec -= do_gettimeoffset() * NSEC_PER_USEC + + (jiffies - wall_jiffies) * NSEC_PER_TICK; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -179,20 +187,15 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or only a single - word. This checks if the address on the stack looks like a kernel - text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. */ - if (in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; + /* Assume the lock function has either no stack frame or a copy + of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses. */ + if (!user_mode(regs) && in_lock_functions(pc)) { + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; } return pc; } @@ -273,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -288,7 +292,7 @@ unsigned long long monotonic_clock(void) this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); - offset *= (NSEC_PER_SEC/HZ) / hpet_tick; + offset *= NSEC_PER_TICK / hpet_tick; } else { do { seq = read_seqbegin(&xtime_lock); @@ -297,7 +301,7 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000 / cpu_khz; + offset = cycles_2_ns(this_offset - last_offset); } return base + offset; } @@ -382,7 +386,7 @@ void main_timer_handler(struct pt_regs *regs) } monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; vxtime.last = offset; #ifdef CONFIG_X86_PM_TIMER @@ -391,36 +395,36 @@ void main_timer_handler(struct pt_regs *regs) #endif } else { offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; if (offset < 0) offset = 0; - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); + if (offset > USEC_PER_TICK) { + lost = offset / USEC_PER_TICK; + offset %= USEC_PER_TICK; } - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) + vxtime.tsc_quot) >> US_SCALE) < offset) vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; + (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; } - if (lost > 0) { + if (lost > 0) handle_lost_ticks(lost, regs); - jiffies += lost; - } + else + lost = 0; /* * Do the timer stuff. */ - do_timer(regs); + do_timer(lost + 1); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); #endif @@ -431,12 +435,8 @@ void main_timer_handler(struct pt_regs *regs) * have to call the local interrupt handler. */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else if (!using_apic_timer) smp_local_timer_interrupt(regs); -#endif /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -460,24 +460,21 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (apic_runs_main_timer > 1) return IRQ_HANDLED; main_timer_handler(regs); -#ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); -#endif return IRQ_HANDLED; } static unsigned int cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return (cyc * cyc2ns_scale) >> NS_SCALE; } unsigned long long sched_clock(void) @@ -490,7 +487,7 @@ unsigned long long sched_clock(void) Disadvantage is a small drift between CPUs in some configurations, but that should be tolerable. */ if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; #endif /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, @@ -633,7 +630,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; } set_cyc2ns_scale(cpu_khz_ref); @@ -789,8 +786,8 @@ static int hpet_timer_stop_set_go(unsigned long tick) if (hpet_use_timer) { hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; } /* @@ -825,8 +822,7 @@ static int hpet_init(void) if (hpet_period < 100000 || hpet_period > 100000000) return -1; - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; hpet_use_timer = (id & HPET_ID_LEGSUP); @@ -882,26 +878,20 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; +static int __cpuinit +time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned cpu = (unsigned long) hcpu; + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); + return NOTIFY_DONE; +} + void __init time_init(void) { - char *timename; - char *gtod; - -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); - } -#endif if (nohpet) vxtime.hpet_address = 0; @@ -912,7 +902,7 @@ void __init time_init(void) -xtime.tv_sec, -xtime.tv_nsec); if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; else vxtime.hpet_address = 0; @@ -935,18 +925,17 @@ void __init time_init(void) } vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); + set_cyc2ns_scale(cpu_khz); setup_irq(0, &irq0); + hotcpu_notifier(time_cpu_notifier, 0); + time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); - set_cyc2ns_scale(cpu_khz); +#ifndef CONFIG_SMP + time_init_gtod(); +#endif } /* @@ -956,13 +945,20 @@ void __init time_init(void) __cpuinit int unsynchronized_tsc(void) { #ifdef CONFIG_SMP - if (oem_force_hpet_timer()) + if (apic_is_clustered_box()) return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the OEM check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; } @@ -970,12 +966,18 @@ __cpuinit int unsynchronized_tsc(void) /* * Decide what mode gettimeofday should use. */ -__init static char *time_init_gtod(void) +void time_init_gtod(void) { char *timetype; if (unsynchronized_tsc()) notsc = 1; + + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) @@ -998,7 +1000,16 @@ __init static char *time_init_gtod(void) timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - return timetype; + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); @@ -1028,8 +1039,16 @@ static int timer_resume(struct sys_device *dev) unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; + long sleep_length = (ctime - sleep_start) * HZ; + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } if (vxtime.hpet_address) hpet_reenable(); else @@ -1145,23 +1164,25 @@ int hpet_rtc_timer_init(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; - local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); + local_irq_restore(flags); + return 1; } static void hpet_rtc_timer_reinit(void) { - unsigned int cfg, cnt; + unsigned int cfg, cnt, ticks_per_int, lost_ints; if (unlikely(!(PIE_on | AIE_on | UIE_on))) { cfg = hpet_readl(HPET_T1_CFG); @@ -1176,10 +1197,33 @@ static void hpet_rtc_timer_reinit(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; + + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } } /* |